Commit e05ae76

Merge branch 'main' into user/pengyunl/disagg_check

2 parents d6dd04d + ac23f4a

79 files changed: +2516 −754 lines

.github/pull_request_template.md
Lines changed: 12 additions & 6 deletions

@@ -3,15 +3,21 @@
 <!--
 Please write the PR title by following this template:
 
-[JIRA ticket/NVBugs ID/GitHub issue][fix/feat/doc/infra/...] \<summary of this PR\>
+**[JIRA ticket/NVBugs ID/GitHub issue/None][type] Summary**
 
-For example, assume I have a PR to support a new feature about cache manager for JIRA ticket TRTLLM-1000, it would be like:
+Valid ticket formats:
+- JIRA ticket: [TRTLLM-1234] or [FOOBAR-123] for other FOOBAR project
+- NVBugs ID: [https://nvbugs/1234567]
+- GitHub issue: [#1234]
+- No ticket: [None]
 
-[TRTLLM-1000][feat] Support a new feature about cache manager
+Valid types (lowercase): [fix], [feat], [doc], [infra], [chore], etc.
 
-Or I have a PR to fix a Llama3 accuracy issue:
-
-[https://nvbugs/1234567][fix] Fix Llama3 accuracy issue
+Examples:
+- [TRTLLM-1234][feat] Add new feature
+- [https://nvbugs/1234567][fix] Fix some bugs
+- [#1234][doc] Update documentation
+- [None][chore] Minor clean-up
 
 -->
 
 ## Description

.github/workflows/pr-check.yml
Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: PR Checks
+
+on:
+  pull_request:
+    types: [opened, edited, synchronize, reopened]
+
+jobs:
+  check-pr-title:
+    name: Check PR Title Format
+    runs-on: ubuntu-latest
+    steps:
+      - name: Validate PR Title Format
+        id: check-pr-title
+        uses: agenthunt/[email protected]
+        continue-on-error: true
+        with:
+          pr-title-regex: "^(\\[(None|[A-Z0-9]+-[0-9]+|#[0-9]+|https:\\/\\/nvbugs\\/[0-9]+)\\])(\\[[a-z0-9]+\\]) (([^ ].*)?[^ ])$"
+          pr-body-regex: ""
+
+      - name: PR Title Format Guide
+        if: steps.check-pr-title.outcome == 'failure'
+        run: |
+          echo "::error::PR title format check failed."
+          echo "Expected PR title format:"
+          echo " [JIRA ticket/NVBugs ID/GitHub issue/None][type] Summary"
+          echo ""
+          echo "Valid ticket formats:"
+          echo " - JIRA ticket: [TRTLLM-1234] or [FOOBAR-123] for other FOOBAR project"
+          echo " - NVBugs ID: [https://nvbugs/1234567]"
+          echo " - GitHub issue: [#1234]"
+          echo " - No ticket: [None]"
+          echo ""
+          echo "Valid types (lowercase): [fix], [feat], [doc], [infra], [chore], etc."
+          echo ""
+          echo "Examples:"
+          echo " - [TRTLLM-1234][feat] Add new feature"
+          echo " - [https://nvbugs/1234567][fix] Fix some bugs"
+          echo " - [#1234][doc] Update documentation"
+          echo " - [None][chore] Minor clean-up"
+          exit 1
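
To try the title check locally, here is a minimal sketch (not part of this commit; the sample titles are illustrative) that exercises the same pattern with std::regex. The YAML double-escaping is removed in the raw string; the first four titles should pass, the last two should fail.

#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main()
{
    // Same pattern as pr-title-regex above, with the YAML escaping removed.
    std::regex const titlePattern(
        R"(^(\[(None|[A-Z0-9]+-[0-9]+|#[0-9]+|https://nvbugs/[0-9]+)\])(\[[a-z0-9]+\]) (([^ ].*)?[^ ])$)");

    std::vector<std::string> const titles{
        "[TRTLLM-1234][feat] Add new feature",
        "[https://nvbugs/1234567][fix] Fix some bugs",
        "[#1234][doc] Update documentation",
        "[None][chore] Minor clean-up",
        "[TRTLLM-1234][Feat] Type tag must be lowercase",
        "Add new feature without any ticket tag",
    };

    for (auto const& title : titles)
    {
        std::cout << (std::regex_match(title, titlePattern) ? "PASS" : "FAIL") << "  " << title << '\n';
    }
    return 0;
}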

cpp/include/tensorrt_llm/runtime/gptDecoder.h
Lines changed: 0 additions & 1 deletion

@@ -20,7 +20,6 @@
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/decodingInput.h"
 #include "tensorrt_llm/runtime/decodingOutput.h"
-#include "tensorrt_llm/runtime/request.h"
 #include "tensorrt_llm/runtime/samplingConfig.h"
 
 #include <NvInferRuntime.h>

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
Lines changed: 2 additions & 2 deletions

@@ -477,9 +477,9 @@ std::map<SizeType32, float> BlockManager::calculateWindowSizeToShare(
         windowSizeToContribution[windowSize] = cacheSizeWeight;
     }
 
-    for (auto const& [windowSize, layers] : windowSizeToLayers)
+    for (auto const& [windowSize, _] : windowSizeToLayers)
     {
-        windowSizeToContribution.at(windowSize) *= windowSize * layers.size();
+        windowSizeToContribution.at(windowSize) *= windowSize;
     }
     auto const windowSizesTotalSum = std::accumulate(windowSizeToContribution.begin(), windowSizeToContribution.end(),
         0.0, [](auto sum, auto const& windowSize) { return sum + windowSize.second; });
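
To make the effect of this hunk concrete, here is a small self-contained sketch (hypothetical free function; the final normalization by the total is assumed from the std::accumulate above): each window size's weight is now scaled by the window size alone, where it was previously scaled by windowSize * layers.size().

#include <cstdint>
#include <map>
#include <numeric>
#include <vector>

using SizeType32 = std::int32_t;

// Hypothetical helper mirroring the hunk above: scale each window's
// cache-size weight by its window size, then normalize to fractions.
std::map<SizeType32, float> windowSizeToShare(
    std::map<SizeType32, std::vector<SizeType32>> const& windowSizeToLayers,
    std::map<SizeType32, float> windowSizeToContribution)
{
    for (auto const& [windowSize, _] : windowSizeToLayers)
    {
        // Before this commit the factor was windowSize * layers.size().
        windowSizeToContribution.at(windowSize) *= windowSize;
    }
    auto const total = std::accumulate(windowSizeToContribution.begin(), windowSizeToContribution.end(), 0.0,
        [](auto sum, auto const& kv) { return sum + kv.second; });
    for (auto& [windowSize, share] : windowSizeToContribution)
    {
        share = static_cast<float>(share / total); // fraction of capacity for this window size
    }
    return windowSizeToContribution;
}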

cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.cpp
Lines changed: 2 additions & 1 deletion

@@ -55,7 +55,8 @@ XQAKernelRuntimeHashKey getRuntimeHashKeyFromXQAParams(XQAParams const& xqaParam
     // precompiled XQA does not use is_fp8_output as hashing key
     return {xqaParams.kv_cache_data_type, head_size, beam_width, kernel_num_q_heads_over_kv, kernel_m_tilesize,
         xqaParams.paged_kv_cache ? static_cast<unsigned int>(xqaParams.tokens_per_block) : 0, xqaParams.paged_kv_cache,
-        xqaParams.multi_query_tokens, isXqaJit ? xqaParams.is_fp8_output : false};
+        xqaParams.multi_query_tokens, isXqaJit ? xqaParams.is_fp8_output : false,
+        isXqaJit ? std::optional(xqaParams.position_embedding_type) : std::nullopt};
 }
 
 } // namespace tensorrt_llm::kernels

cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h
Lines changed: 4 additions & 1 deletion

@@ -67,14 +67,15 @@ struct XQAKernelRuntimeHashKey
     bool paged_kv_cache;
     bool multi_query_tokens;
     bool is_fp8_output;
+    std::optional<PositionEmbeddingType> position_embedding_type;
 
     bool operator==(XQAKernelRuntimeHashKey const& other) const
     {
         return kv_data_type == other.kv_data_type && head_size == other.head_size
             && num_q_heads_per_kv == other.num_q_heads_per_kv && beam_size == other.beam_size
             && multi_query_tokens == other.multi_query_tokens && m_tilesize == other.m_tilesize
             && tokens_per_page == other.tokens_per_page && paged_kv_cache == other.paged_kv_cache
-            && is_fp8_output == other.is_fp8_output;
+            && is_fp8_output == other.is_fp8_output && position_embedding_type == other.position_embedding_type;
     }
 };
 
@@ -103,6 +104,8 @@ struct XQAKernelRuntimeHasher
         key ^= s.multi_query_tokens;
         key <<= 1;
         key ^= s.is_fp8_output;
+        key <<= 8;
+        key ^= static_cast<int8_t>(s.position_embedding_type.value_or(static_cast<PositionEmbeddingType>(-1)));
         return key;
     }
 };
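
The two new hasher lines shift the key by a whole byte before XOR-folding the enum, so the position-embedding value cannot collide with the single-bit bool flags folded in above it. A standalone sketch of just that step (the enumerators here are placeholders, not the real PositionEmbeddingType list):

#include <cstdint>
#include <optional>

enum class PositionEmbeddingType : int8_t
{
    kLEARNED = 0, // placeholder enumerators for illustration only
    kROPE = 1,
};

uint64_t foldIntoKey(uint64_t key, std::optional<PositionEmbeddingType> positionEmbeddingType)
{
    key <<= 8; // reserve a full byte so the enum does not overlap the bool flags
    // -1 is the sentinel for "no position embedding type" (the precompiled path);
    // the int8_t sign-extends through integer promotion before the XOR.
    key ^= static_cast<int8_t>(
        positionEmbeddingType.value_or(static_cast<PositionEmbeddingType>(-1)));
    return key;
}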

cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp
Lines changed: 2 additions & 2 deletions

@@ -37,8 +37,8 @@ using ::tensorrt_llm::kernels::XQAKernelMetaInfo;
 XQAKernelRuntimeHashKey getRuntimeHashKeyFromKernelMeta(XQAKernelMetaInfo const& kernelMeta)
 {
     return {kernelMeta.mKVDataType, kernelMeta.mHeadDim, kernelMeta.mBeamWidth, kernelMeta.mNumQHeadsOverKV,
-        kernelMeta.mMTileSize, kernelMeta.mTokensPerPage, kernelMeta.mPagedKVCache, kernelMeta.mMultiQueryTokens,
-        0 /* xqa jit param is_fp8_output */};
+        kernelMeta.mMTileSize, kernelMeta.mTokensPerPage, kernelMeta.mPagedKVCache, kernelMeta.mMultiQueryTokens, false,
+        std::nullopt};
 }
 
 } // anonymous namespace

cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp
Lines changed: 3 additions & 2 deletions

@@ -97,7 +97,7 @@ class XQAKernelList
         }
         XQAKernelRuntimeHashKey hash_key{kernelMeta.mKVDataType, kernelMeta.mHeadDim, kernelMeta.mBeamWidth,
             kernelMeta.mNumQHeadsOverKV, kernelMeta.mMTileSize, kernelMeta.mTokensPerPage, kernelMeta.mPagedKVCache,
-            kernelMeta.mMultiQueryTokens, 0 /* xqa jit param is_fp8_output */};
+            kernelMeta.mMultiQueryTokens, false, std::nullopt};
 
         mFunctions.insert(std::make_pair(hash_key, funcInfo));
     }
@@ -128,7 +128,8 @@ class XQAKernelList
         XQAKernelRuntimeHashKey hash_key
             = {xqaParams.kv_cache_data_type, head_size, beam_width, kernel_num_q_heads_over_kv, m_tilesize,
                 xqaParams.paged_kv_cache ? static_cast<unsigned int>(xqaParams.tokens_per_block) : 0,
-                xqaParams.paged_kv_cache, xqaParams.multi_query_tokens, 0 /* xqa jit param is_fp8_output */};
+                xqaParams.paged_kv_cache, xqaParams.multi_query_tokens, 0, /* xqa jit param is_fp8_output */
+                std::nullopt};
         auto const findIter = mFunctions.find(hash_key);
         return findIter != mFunctions.end();
     }
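
The key and hasher are consumed by a hash map of kernels (mFunctions above). A condensed sketch with a hypothetical two-field key shows the lookup pattern, and why the precompiled path consistently passes false and std::nullopt so its keys keep matching:

#include <cstddef>
#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>

// Condensed two-field stand-in for XQAKernelRuntimeHashKey.
struct MiniKey
{
    bool is_fp8_output;
    std::optional<int8_t> position_embedding_type;

    bool operator==(MiniKey const& other) const
    {
        return is_fp8_output == other.is_fp8_output
            && position_embedding_type == other.position_embedding_type;
    }
};

struct MiniKeyHasher
{
    size_t operator()(MiniKey const& k) const
    {
        uint64_t key = k.is_fp8_output;
        key <<= 8; // the real hasher XORs the sign-extended int8_t here
        key ^= static_cast<uint8_t>(k.position_embedding_type.value_or(-1));
        return key;
    }
};

int main()
{
    std::unordered_map<MiniKey, std::string, MiniKeyHasher> functions;
    functions[{false, std::nullopt}] = "precompiled kernel";
    functions[{true, 0}] = "jit kernel";

    // A precompiled lookup always uses {false, std::nullopt}, matching the hunks above.
    return functions.find({false, std::nullopt}) != functions.end() ? 0 : 1;
}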

cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp
Lines changed: 3 additions & 56 deletions

@@ -20,18 +20,13 @@
 #include "tensorrt_llm/batch_manager/assignReqSeqSlots.h"
 #include "tensorrt_llm/batch_manager/capacityScheduler.h"
 #include "tensorrt_llm/batch_manager/createNewDecoderRequests.h"
-#include "tensorrt_llm/batch_manager/handleContextLogits.h"
-#include "tensorrt_llm/batch_manager/handleGenerationLogits.h"
 #include "tensorrt_llm/batch_manager/kvCacheManager.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h"
 #include "tensorrt_llm/batch_manager/logitsPostProcessor.h"
-#include "tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h"
 #include "tensorrt_llm/batch_manager/medusaBuffers.h"
 #include "tensorrt_llm/batch_manager/microBatchScheduler.h"
 #include "tensorrt_llm/batch_manager/pauseRequests.h"
 #include "tensorrt_llm/batch_manager/peftCacheManager.h"
-#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
-#include "tensorrt_llm/batch_manager/updateDecoderBuffers.h"
 #include "tensorrt_llm/nanobind/common/customCasters.h"
 #include "tensorrt_llm/runtime/decoderState.h"
 #include "tensorrt_llm/runtime/torch.h"
@@ -94,48 +89,6 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
             nb::arg("generation_requests"), nb::arg("model_config"), nb::arg("cross_kv_cache_manager") = std::nullopt)
         .def("name", [](AllocateKvCache const&) { return AllocateKvCache::name; });
 
-    nb::class_<HandleContextLogits>(m, HandleContextLogits::name)
-        .def(nb::init<>())
-        .def(
-            "__call__",
-            [](HandleContextLogits const& self, DecoderInputBuffers& inputBuffers, RequestVector const& contextRequests,
-                at::Tensor const& logits, std::vector<tr::SizeType32> const& numContextLogitsVec,
-                tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
-                OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
-            {
-                return self(inputBuffers, contextRequests, tr::TorchView::of(logits), numContextLogitsVec, modelConfig,
-                    manager, medusaBuffers);
-            },
-            nb::arg("decoder_input_buffers"), nb::arg("context_requests"), nb::arg("logits"),
-            nb::arg("num_context_logits"), nb::arg("model_config"), nb::arg("buffer_manager"),
-            nb::arg("medusa_buffers") = std::nullopt)
-        .def("name", [](HandleContextLogits const&) { return HandleContextLogits::name; });
-
-    nb::class_<HandleGenerationLogits>(m, HandleGenerationLogits::name)
-        .def(nb::init<>())
-        .def(
-            "__call__",
-            [](HandleGenerationLogits const& self, DecoderInputBuffers& inputBuffers,
-                RequestVector const& generationRequests, at::Tensor const& logits, tr::SizeType32 logitsIndex,
-                tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
-                OptionalRef<RuntimeBuffers> genRuntimeBuffers = std::nullopt,
-                OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
-            {
-                self(inputBuffers, generationRequests, tr::TorchView::of(logits), logitsIndex, modelConfig, manager,
-                    genRuntimeBuffers, medusaBuffers);
-            },
-            nb::arg("decoder_input_buffers"), nb::arg("generation_requests"), nb::arg("logits"),
-            nb::arg("logits_index"), nb::arg("model_config"), nb::arg("buffer_manager"),
-            nb::arg("gen_runtime_buffers") = std::nullopt, nb::arg("medusa_buffers") = std::nullopt)
-        .def("name", [](HandleGenerationLogits const&) { return HandleGenerationLogits::name; });
-
-    nb::class_<MakeDecodingBatchInputOutput>(m, MakeDecodingBatchInputOutput::name)
-        .def(nb::init<>())
-        .def("__call__", &MakeDecodingBatchInputOutput::operator(), nb::arg("decoder_input_buffers"),
-            nb::arg("decoder_state"), nb::arg("model_config"), nb::arg("max_num_sequences"),
-            nb::arg("fused_runtime_buffers") = std::nullopt)
-        .def("name", [](MakeDecodingBatchInputOutput const&) { return MakeDecodingBatchInputOutput::name; });
-
     nb::class_<LogitsPostProcessor>(m, LogitsPostProcessor::name)
         .def(nb::init<>())
         .def("__call__", &LogitsPostProcessor::operator(), nb::arg("decoder_input_buffers"),
@@ -154,8 +107,9 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
                 DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
                 tensorrt_llm::runtime::CudaStream const& runtimeStream,
                 tensorrt_llm::runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength,
-                SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt)
+                SizeType32 beamWidth)
             {
+                OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt;
                 auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] = self(modelConfig,
                     worldConfig, decodingConfig, contextRequests, bufferManager, logitsType, inputBuffers, decoderState,
                     runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
@@ -166,13 +120,6 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
             nb::arg("model_config"), nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("context_requests"),
             nb::arg("buffer_manager"), nb::arg("logits_type"), nb::arg("decoder_input_buffers"),
             nb::arg("decoder_state"), nb::arg("runtime_stream"), nb::arg("decoder_stream"),
-            nb::arg("max_sequence_length"), nb::arg("beam_width"), nb::arg("medusa_buffers") = std::nullopt)
+            nb::arg("max_sequence_length"), nb::arg("beam_width"))
         .def("name", [](CreateNewDecoderRequests const&) { return CreateNewDecoderRequests::name; });
-
-    nb::class_<UpdateDecoderBuffers>(m, UpdateDecoderBuffers::name)
-        .def(nb::init<>())
-        .def("__call__", &UpdateDecoderBuffers::operator(), nb::arg("model_config"), nb::arg("decoder_output_buffers"),
-            nb::arg("copy_buffer_manager"), nb::arg("decoder_state"), nb::arg("return_log_probs"),
-            nb::arg("decoder_finish_event"))
-        .def("name", [](UpdateDecoderBuffers const&) { return UpdateDecoderBuffers::name; });
 }
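
For readers skimming the removals, the bindings that remain all follow one nanobind idiom: a default-constructible algorithm object exposed as a Python callable plus a name accessor. A minimal sketch with a hypothetical Echo algorithm and module name:

#include <nanobind/nanobind.h>
#include <string>

namespace nb = nanobind;

// Hypothetical stand-in for an algorithm class such as LogitsPostProcessor.
struct Echo
{
    constexpr static auto name = "Echo";

    std::string operator()(std::string const& text) const
    {
        return text;
    }
};

NB_MODULE(example_bindings, m)
{
    nb::class_<Echo>(m, Echo::name)
        .def(nb::init<>())
        .def("__call__", &Echo::operator(), nb::arg("text"))
        .def("name", [](Echo const&) { return Echo::name; });
}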

cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
Lines changed: 0 additions & 9 deletions

@@ -20,11 +20,9 @@
 
 #include "tensorrt_llm/batch_manager/common.h"
 #include "tensorrt_llm/batch_manager/decoderBuffers.h"
-#include "tensorrt_llm/batch_manager/medusaBuffers.h"
 #include "tensorrt_llm/batch_manager/microBatchScheduler.h"
 #include "tensorrt_llm/batch_manager/peftCacheManager.h"
 #include "tensorrt_llm/batch_manager/rnnStateManager.h"
-#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
 #include "tensorrt_llm/batch_manager/sequenceSlotManager.h"
 #include "tensorrt_llm/nanobind/common/bindTypes.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
@@ -419,13 +417,6 @@ void initBindings(nb::module_& m)
         .def_rw("log_probs_host", &tb::SlotDecoderBuffers::logProbsHost)
         .def_rw("finish_reasons_host", &tb::SlotDecoderBuffers::finishReasonsHost);
 
-    nb::class_<tb::MedusaBuffers>(m, "MedusaBuffers")
-        .def(nb::init<runtime::SizeType32, runtime::SizeType32, runtime::BufferManager const&,
-                 runtime::ModelConfig const&, runtime::WorldConfig const&, executor::DecodingConfig const&,
-                 runtime::TllmRuntime const&>(),
-            nb::arg("max_beam_width"), nb::arg("max_seq_len"), nb::arg("buffer_manager"), nb::arg("model_config"),
-            nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("runtime"));
-
     m.def(
         "add_new_tokens_to_requests",
         [](std::vector<std::shared_ptr<tb::LlmRequest>>& requests,
