diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000000..281cb2d85d91
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "third_party/cutlass"]
+ path = third_party/cutlass
+ url = https://github.com/NVIDIA/cutlass.git
diff --git a/MANIFEST.in b/MANIFEST.in
index 0c897cf147f1..5e218f8a30a2 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,3 +2,4 @@ include LICENSE
 include requirements.txt
 recursive-include csrc *
+recursive-include third_party *
diff --git a/csrc/cutlass_extensions/arch/mma.h b/csrc/cutlass_extensions/arch/mma.h
new file mode 100644
index 000000000000..2362da4f7f2d
--- /dev/null
+++ b/csrc/cutlass_extensions/arch/mma.h
@@ -0,0 +1,120 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates exposing architecture support for multiply-add operations
+*/
+
+#pragma once
+#include "cutlass_extensions/weight_only_quant_op.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass
+{
+namespace arch
+{
+
+// Tag which triggers an MMA that dequantizes the interleaved B operand to A's type.
+struct OpMultiplyAddDequantizeInterleavedBToA;
+
+/*
+  Below we have extra tags to signal what kind of dequantization we want to do
+  (per col, scale only fine grained, finegrained with zero). This still lets us
+  use the existing template infrastructure (incl. that in CUTLASS). However, we
+  split out the template below into OpMultiplyAddDequantizeInterleavedBToA along
+  with the quantization op before instantiating the GEMM pieces.
+
+  Note that this is somewhat of a hack, but it SIGNIFICANTLY reduces the amount of
+  code we need to duplicate.
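The tag/detag scheme described in the comment above can be sketched standalone. This is an illustration only: the names mirror TagOperator/DetagOperator, but the operator tags and the enum are local stand-ins rather than the real CUTLASS types, so none of the GEMM machinery is involved.

#include <type_traits>

enum class WeightOnlyQuantOp { PER_COLUMN_SCALE_ONLY, FINEGRAINED_SCALE_ONLY, FINEGRAINED_SCALE_AND_ZEROS };

// Plain operator tags, standing in for cutlass::arch::OpMultiplyAdd and friends.
struct OpMultiplyAdd {};
struct OpMultiplyAdd_percol_scale {};

// Attach the quant mode: the default keeps the operator unchanged.
template <typename MmaOp, WeightOnlyQuantOp QuantOp>
struct TagOperator
{
    using TaggedOperator = MmaOp;
};

template <>
struct TagOperator<OpMultiplyAdd, WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY>
{
    using TaggedOperator = OpMultiplyAdd_percol_scale;
};

// Recover the original operator plus the quant mode from the tagged type.
template <typename TaggedMmaOp>
struct DetagOperator
{
    using Operator = TaggedMmaOp;
    static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY;
};

template <>
struct DetagOperator<OpMultiplyAdd_percol_scale>
{
    using Operator = OpMultiplyAdd;
    static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY;
};

int main()
{
    using Tagged = TagOperator<OpMultiplyAdd, WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY>::TaggedOperator;
    using Detag = DetagOperator<Tagged>;
    static_assert(std::is_same_v<Detag::Operator, OpMultiplyAdd>);
    static_assert(Detag::QuantOp == WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY);
    return 0;
}

The point of the round trip is that the quantization mode rides along inside an ordinary template parameter, so the existing kernel templates never need an extra argument.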
+ */ +struct OpMultiplyAddDequantizeInterleavedBToA_percol_scale; +struct OpMultiplyAddDequantizeInterleavedBToA_fine_scale; +struct OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias; + +// The default just forwards the original operator +template +struct TagOperator +{ + using TaggedOperator = MmaOp; +}; + +// Specializations below attach more information to the operator +template <> +struct TagOperator +{ + using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_percol_scale; +}; + +template <> +struct TagOperator +{ + using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scale; +}; + +template <> +struct TagOperator +{ + using TaggedOperator = OpMultiplyAddDequantizeInterleavedBToA_fine_scalebias; +}; + +// Here we instantiate some structs to "detag" the tagged operator. It splits it back to the original +// operator + the extra information. If no extra info was tagged, the dequant op per column scaling +// as a default. +template +struct DetagOperator +{ + using Operator = TaggedMmaOp; + static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY; +}; + +template <> +struct DetagOperator +{ + using Operator = OpMultiplyAddDequantizeInterleavedBToA; + static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY; +}; + +template <> +struct DetagOperator +{ + using Operator = OpMultiplyAddDequantizeInterleavedBToA; + static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY; +}; + +template <> +struct DetagOperator +{ + using Operator = OpMultiplyAddDequantizeInterleavedBToA; + static constexpr WeightOnlyQuantOp QuantOp = WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS; +}; + +} // namespace arch +} // namespace cutlass diff --git a/csrc/cutlass_extensions/compute_occupancy.h b/csrc/cutlass_extensions/compute_occupancy.h new file mode 100644 index 000000000000..97bf693e7092 --- /dev/null +++ b/csrc/cutlass_extensions/compute_occupancy.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include "cutlass/device_kernel.h" +#include "cudaUtils.h" + +namespace tensorrt_llm +{ +namespace cutlass_extensions +{ + +template +inline int compute_occupancy_for_kernel() +{ + + int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + + if (smem_size > (48 << 10)) + { + cudaFuncAttributes attr; + int device = 0; + int max_smem_per_block = 0; + tensorrt_llm::common::check_cuda_error(cudaGetDevice(&device)); + tensorrt_llm::common::check_cuda_error( + cudaDeviceGetAttribute(&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device)); + tensorrt_llm::common::check_cuda_error(cudaFuncGetAttributes(&attr, cutlass::Kernel)); + if (smem_size + attr.sharedSizeBytes >= static_cast(max_smem_per_block)) + { + // This should mean that + // cudaFuncSetAttribute(cutlass::Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size) + // wouldn't work. 
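The occupancy helper above can be approximated for an ordinary kernel with plain CUDA runtime calls. A rough standalone sketch, with demo_kernel standing in for cutlass::Kernel<GemmKernel> and the 48 KiB static shared-memory limit handled the same way; error checking is omitted for brevity:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void demo_kernel(float* out)  // stand-in for the templated CUTLASS kernel entry point
{
    extern __shared__ float smem[];
    smem[threadIdx.x] = threadIdx.x;
    __syncthreads();
    out[blockIdx.x * blockDim.x + threadIdx.x] = smem[threadIdx.x];
}

int occupancy_for(int threads_per_block, int dynamic_smem_bytes)
{
    int device = 0, max_optin_smem = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&max_optin_smem, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);

    cudaFuncAttributes attr{};
    cudaFuncGetAttributes(&attr, demo_kernel);

    // Static plus dynamic shared memory must fit even after opting in; otherwise report 0
    // occupancy so a heuristic can skip this configuration, as the helper above does.
    if (dynamic_smem_bytes + static_cast<int>(attr.sharedSizeBytes) > max_optin_smem)
        return 0;

    if (dynamic_smem_bytes > 48 * 1024)
        cudaFuncSetAttribute(demo_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, dynamic_smem_bytes);

    int max_active_blocks = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, demo_kernel,
                                                  threads_per_block, dynamic_smem_bytes);
    return max_active_blocks;
}

int main()
{
    std::printf("blocks/SM at 64 KiB dynamic smem: %d\n", occupancy_for(256, 64 * 1024));
    return 0;
}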
In that case, we return an occupancy of 0. This will cause the heuristic to ignore this + // configuration. + return 0; + } + } + + int max_active_blocks = -1; + tensorrt_llm::common::check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, cutlass::Kernel, GemmKernel::kThreadCount, smem_size)); + + return max_active_blocks; +} + +} // namespace cutlass_extensions +} // namespace tensorrt_llm diff --git a/csrc/cutlass_extensions/cudaUtils.h b/csrc/cutlass_extensions/cudaUtils.h new file mode 100644 index 000000000000..db6fd6e5af5e --- /dev/null +++ b/csrc/cutlass_extensions/cudaUtils.h @@ -0,0 +1,117 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace tensorrt_llm::common +{ + +/* **************************** debug tools ********************************* */ +static const char* _cudaGetErrorEnum(cudaError_t error) +{ + return cudaGetErrorString(error); +} + +static const char* _cudaGetErrorEnum(cublasStatus_t error) +{ + switch (error) + { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; + } + return ""; +} + +static std::string vformat(char const* fmt, va_list args) +{ + va_list args0; + va_copy(args0, args); + auto const size = std::vsnprintf(nullptr, 0, fmt, args0); + if (size <= 0) + return ""; + + std::string stringBuf(size, char{}); + auto const size2 = std::vsnprintf(&stringBuf[0], size + 1, fmt, args); + return stringBuf; +} + +static std::string fmtstr(char const* format, ...) 
+{ + va_list args; + va_start(args, format); + std::string result = vformat(format, args); + va_end(args); + return result; +}; + +// FIXME(woosuk) +template +void check(T result, char const* const func, const char* const file, int const line) +{ + if (result) + { + throw std::runtime_error( + fmtstr("[ERROR] CUDA runtime error in %s: %s %s:%d\n", func, _cudaGetErrorEnum(result), file, line)); + } +} + +#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__) +#define check_cuda_error_2(val, file, line) check((val), #val, file, line) + +inline int getSMVersion() +{ + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + int sm_major = 0; + int sm_minor = 0; + check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device)); + check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device)); + return sm_major * 10 + sm_minor; +} + +} // namespace tensorrt_llm::common diff --git a/csrc/cutlass_extensions/epilogue/thread/fused_activations.h b/csrc/cutlass_extensions/epilogue/thread/fused_activations.h new file mode 100644 index 000000000000..2ed13dde1920 --- /dev/null +++ b/csrc/cutlass_extensions/epilogue/thread/fused_activations.h @@ -0,0 +1,105 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing linear combination with a maximum operation used by epilogues. 
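A minimal standalone version of the check/throw pattern defined above might look as follows. CHECK_CUDA and throw_on_cuda_error are hypothetical names, and unlike the real helper this sketch skips the printf-style formatting done by fmtstr:

#include <cuda_runtime.h>
#include <cstdio>
#include <stdexcept>
#include <string>

// Throw on any non-success CUDA runtime result, recording the failing expression and call site.
inline void throw_on_cuda_error(cudaError_t result, const char* expr, const char* file, int line)
{
    if (result != cudaSuccess)
    {
        throw std::runtime_error(std::string("[ERROR] CUDA runtime error in ") + expr + ": "
            + cudaGetErrorString(result) + " " + file + ":" + std::to_string(line));
    }
}

#define CHECK_CUDA(expr) throw_on_cuda_error((expr), #expr, __FILE__, __LINE__)

int main()
{
    try
    {
        int device = -1;
        CHECK_CUDA(cudaGetDevice(&device));
        int major = 0, minor = 0;
        CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
        CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
        std::printf("sm%d\n", major * 10 + minor);  // same encoding as getSMVersion(), e.g. 80 for A100
    }
    catch (const std::exception& e)
    {
        std::fprintf(stderr, "%s\n", e.what());
        return 1;
    }
    return 0;
}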
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/functional.h" +#include "cutlass/half.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace epilogue +{ +namespace thread +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +__forceinline__ __device__ float copysignf_pos(float a, float b) +{ + float r; + r = __int_as_float(__float_as_int(a) | (__float_as_int(b) & 0x80000000)); + return r; +} + +__forceinline__ __device__ float tanh_opt(float x) +{ +#if (__CUDACC_VER_MAJOR__ < 11) || (__CUDA_ARCH__ < 750) + const float exp_val = -1.f * fabs(2 * x); + return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x); +#else + return fast_tanh(x); +#endif +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +template <> +struct GELU_taylor +{ + static const bool kIsHeavy = true; + + CUTLASS_DEVICE + float operator()(float const& z) const + { + + float k0 = float(0.7978845608028654); + float k1 = float(0.044715); + + return float(cutlass::constants::half() * z + * (cutlass::constants::one() + tanh_opt(k0 * z * (cutlass::constants::one() + k1 * z * z)))); + } + + using Params = LinearCombinationGenericParams; + + CUTLASS_DEVICE + float operator()(float const& scalar, Params const& params_) const + { + return this->operator()(scalar); + } +}; + +} // namespace thread +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale.h b/csrc/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale.h new file mode 100644 index 000000000000..1781fc3ac94c --- /dev/null +++ b/csrc/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale.h @@ -0,0 +1,352 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
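The constants in the GELU_taylor specialization above are the usual tanh approximation of GELU (k0 = sqrt(2/pi), k1 = 0.044715). A quick standalone comparison against the exact erf-based definition, in plain C++ with no CUTLASS or CUDA dependencies:

#include <cmath>
#include <cstdio>

// Exact GELU: 0.5 * x * (1 + erf(x / sqrt(2)))
float gelu_exact(float x)
{
    return 0.5f * x * (1.0f + std::erf(x / std::sqrt(2.0f)));
}

// Tanh approximation used by the epilogue functor above.
float gelu_tanh(float x)
{
    const float k0 = 0.7978845608028654f;  // sqrt(2 / pi)
    const float k1 = 0.044715f;
    return 0.5f * x * (1.0f + std::tanh(k0 * x * (1.0f + k1 * x * x)));
}

int main()
{
    for (float x : {-3.0f, -1.0f, -0.1f, 0.0f, 0.5f, 2.0f})
        std::printf("x=% .2f  exact=% .6f  tanh approx=% .6f\n", x, gelu_exact(x), gelu_tanh(x));
    return 0;
}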
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Epilogue visitor for threadblock scoped INT8 GEMMs that uses one scaling factor per row, and one per column. + + original file: 3rdparty/cutlass/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h + +*/ + +#pragma once + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/arch/memory.h" +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/numeric_conversion.h" +#include "tensorrt_llm/common/quantization.h" + +namespace tk = tensorrt_llm::common; + +namespace cutlass +{ +namespace epilogue +{ +namespace threadblock +{ + +template +class EpilogueVisitorPerRowPerCol +{ +public: + using ThreadblockShape = ThreadblockShape_; + static int const kThreadCount = ThreadCount; + + using ScaleTileIterator = ScaleTileIterator_; + using OutputTileIterator = OutputTileIterator_; + using ElementwiseFunctor = ElementwiseFunctor_; + + static int const kIterations = OutputTileIterator::kIterations; + static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess; + + using ElementOutput = typename OutputTileIterator::Element; + using LayoutOutput = cutlass::layout::RowMajor; + using ElementAccumulator = ElementAccumulator_; + + using AlphaScaleElementType = typename ScaleTileIterator::Element; + + using ElementCompute = ElementCompute_; + using AccumulatorFragment = Array; + using ComputeFragment = Array; + using OutputVector = Array; + + static int const kThreadsPerRow = OutputTileIterator::ThreadMap::Detail::kAccessWidth; + static bool const kHasMultiStepsInRow = (OutputTileIterator::ThreadMap::Iterations::kColumn > 1); + + /// Argument structure + struct Arguments + { + + typename ElementwiseFunctor::Params elementwise; + int64_t batch_stride_alpha; + int64_t batch_stride_C; + int64_t batch_stride_D; + + // + // Methods + // + Arguments() + : batch_stride_alpha(0) + , batch_stride_C(0) + , batch_stride_D(0) + { + } + + Arguments(typename ElementwiseFunctor::Params elementwise_) + : elementwise(elementwise_) + , batch_stride_alpha(0) + , batch_stride_C(0) + , batch_stride_D(0) + { + } + + Arguments(typename ElementwiseFunctor::Params elementwise_, int64_t batch_stride_alpha_, + int64_t batch_stride_C_, int64_t batch_stride_D_) + : elementwise(elementwise_) + , batch_stride_alpha(batch_stride_alpha_) + , batch_stride_C(batch_stride_C_) + , batch_stride_D(batch_stride_D_) + { + } + }; + + struct Params + { + + typename ElementwiseFunctor::Params elementwise; + int64_t batch_stride_alpha; + int64_t batch_stride_C; + int64_t batch_stride_D; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + Params(Arguments const& args) + : elementwise(args.elementwise) + , batch_stride_alpha(args.batch_stride_alpha) + , batch_stride_C(args.batch_stride_C) + , 
batch_stride_D(args.batch_stride_D) + { + } + }; + + /// Shared storage + struct SharedStorage + { + }; + +private: + Params const& params_; + SharedStorage& shared_storage_; + MatrixCoord extent_; + MatrixCoord extent_real_; + ElementwiseFunctor elementwise_; + + const bool per_token_quant_; + const bool per_channel_quant_; + + AlphaScaleElementType* ptr_alpha_row_; + AlphaScaleElementType* ptr_alpha_col_; + ScaleTileIterator iterator_alpha_col_; + OutputTileIterator iterator_C_; + OutputTileIterator iterator_D_; + + AlphaScaleElementType element_alpha_row_ = 1.0f; + AlphaScaleElementType element_alpha_col_ = 1.0f; + typename ScaleTileIterator::Fragment fragment_alpha_col_; + typename OutputTileIterator::Fragment fragment_C_; + typename OutputTileIterator::Fragment fragment_D_; + + ElementAccumulator beta_; + + int column_offset_; + + MatrixCoord thread_offset_; + +public: + CUTLASS_DEVICE + EpilogueVisitorPerRowPerCol(Params const& params, SharedStorage& shared_storage, + cutlass::MatrixCoord const& problem_size, int thread_idx, int warp_idx, int lane_idx, + typename ScaleTileIterator::Params params_alpha_col, typename OutputTileIterator::Params params_C, + typename OutputTileIterator::Params params_D, tk::QuantMode quant_option, AlphaScaleElementType* ptr_alpha_row, + AlphaScaleElementType* ptr_alpha_col, typename OutputTileIterator::Element* ptr_C, + typename OutputTileIterator::Element* ptr_D, + cutlass::MatrixCoord const& threadblock_offset = cutlass::MatrixCoord(0, 0), int column_offset = 0, + cutlass::MatrixCoord const& problem_size_real = cutlass::MatrixCoord(0, 0)) + : params_(params) + , shared_storage_(shared_storage) + , extent_(problem_size) + , elementwise_(params.elementwise) + , per_token_quant_(quant_option.hasPerTokenScaling()) + , per_channel_quant_(quant_option.hasPerChannelScaling()) + , ptr_alpha_row_(ptr_alpha_row) + , ptr_alpha_col_(ptr_alpha_col) + , iterator_alpha_col_(params_alpha_col, ptr_alpha_col, problem_size, thread_idx, threadblock_offset) + , iterator_C_(params_C, ptr_C, problem_size, thread_idx, threadblock_offset) + , iterator_D_(params_D, ptr_D, problem_size, thread_idx, threadblock_offset) + , extent_real_(problem_size_real) + { + beta_ = (params.elementwise.beta_ptr ? 
*params.elementwise.beta_ptr : params.elementwise.beta); + + if (beta_ == ElementAccumulator()) + { + iterator_C_.clear_mask(); + } + + if (!per_channel_quant_ && (ptr_alpha_col_ != nullptr)) + { + element_alpha_col_ = *ptr_alpha_col_; + } + + if (!per_token_quant_ && (ptr_alpha_row_ != nullptr)) + { + element_alpha_row_ = *ptr_alpha_row_; + } + } + + /// Helper to indicate split-K behavior + CUTLASS_DEVICE + void set_k_partition(int split_k_index, ///< Index of this threadblock within split-K partitioned scheme + int split_k_slices) + { ///< Total number of split-K slices + } + + /// Called to set the batch index + CUTLASS_DEVICE + void set_batch_index(int batch_idx) + { + iterator_alpha_col_.add_pointer_offset(batch_idx * params_.batch_stride_alpha); + iterator_C_.add_pointer_offset(batch_idx * params_.batch_stride_C); + iterator_D_.add_pointer_offset(batch_idx * params_.batch_stride_D); + } + + /// Called at the start of the epilogue just before iterating over accumulator slices + CUTLASS_DEVICE + void begin_epilogue() + { + if (per_channel_quant_) + { + iterator_alpha_col_.load(fragment_alpha_col_); + } + } + + /// Called at the start of one step before starting accumulator exchange + CUTLASS_DEVICE + void begin_step(int step_idx) + { + fragment_D_.clear(); + fragment_C_.clear(); + + if (elementwise_.kScale != cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) + { + iterator_C_.load(fragment_C_); + ++iterator_C_; + } + } + + /// Called at the start of a row + CUTLASS_DEVICE + void begin_row(int row_idx) + { + // load alpha_row in begin_step only when per token(row) scaling is used + if (per_token_quant_) + { + int thread_offset_row + = iterator_D_.thread_start_row() + OutputTileIterator::ThreadMap::iteration_offset(row_idx).row(); + + arch::global_load( + element_alpha_row_, ptr_alpha_row_ + thread_offset_row, thread_offset_row < extent_.row()); + } + } + + /// Called after accumulators have been exchanged for each accumulator vector + CUTLASS_DEVICE + void visit(int iter_idx, int row_idx, int column_idx, int frag_idx, AccumulatorFragment const& accum) + { + + NumericArrayConverter source_converter; + + ComputeFragment result = source_converter(accum); + if (per_channel_quant_) + { + ComputeFragment alpha_col = reinterpret_cast(&fragment_alpha_col_)[column_idx]; + result = per_token_channel_scale_accumulator_(result, alpha_col, element_alpha_row_); + } + else + { + result = per_token_scale_accumulator_(result, element_alpha_col_, element_alpha_row_); + } + + // Convert to the output + NumericArrayConverter output_converter; + OutputVector& output = reinterpret_cast(&fragment_D_)[frag_idx]; + output = output_converter(result); + } + + /// Called at the end of a row + CUTLASS_DEVICE + void end_row(int row_idx) {} + + /// Called after all accumulator elements have been visited + CUTLASS_DEVICE + void end_step(int step_idx) + { + + iterator_D_.store(fragment_D_); + ++iterator_D_; + } + + /// Called after all steps have been completed + CUTLASS_DEVICE + void end_epilogue() {} + +private: + CUTLASS_DEVICE + ComputeFragment per_token_channel_scale_accumulator_( + ComputeFragment const& accum, ComputeFragment const& scale_col, AlphaScaleElementType const& scale_row) + { + + ComputeFragment result; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ComputeFragment::kElements; ++i) + { + result[i] = accum[i] * (scale_col[i] * scale_row); + } + + return result; + } + + CUTLASS_DEVICE + ComputeFragment per_token_scale_accumulator_( + ComputeFragment const& accum, AlphaScaleElementType const& 
scale_col, AlphaScaleElementType const& scale_row) + { + + ComputeFragment result; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ComputeFragment::kElements; ++i) + { + result[i] = accum[i] * (scale_col * scale_row); + } + + return result; + } +}; + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass diff --git a/csrc/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h b/csrc/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h new file mode 100644 index 000000000000..6f26d7901703 --- /dev/null +++ b/csrc/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h @@ -0,0 +1,282 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. + + The epilogue rearranges the result of a matrix product through shared memory to match canonical + tensor layouts in global memory. Epilogues support conversion and reduction operations. 
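Functionally, the visitor above multiplies each int32 accumulator by one scale per row (token) and one per column (channel) before converting to the output type. A scalar reference loop, purely illustrative since the real code operates on register fragments:

#include <cstdint>
#include <vector>

// D[m][n] = float(acc[m][n]) * alpha_row[m] * alpha_col[n]
// alpha_row collapses to a single scalar when per-token scaling is disabled,
// and alpha_col to a single scalar when per-channel scaling is disabled.
std::vector<float> dequantize_per_row_per_col(const std::vector<int32_t>& acc, int M, int N,
                                              const std::vector<float>& alpha_row,
                                              const std::vector<float>& alpha_col)
{
    std::vector<float> out(static_cast<size_t>(M) * N);
    for (int m = 0; m < M; ++m)
        for (int n = 0; n < N; ++n)
            out[m * N + n] = static_cast<float>(acc[m * N + n]) * alpha_row[m] * alpha_col[n];
    return out;
}

int main()
{
    auto d = dequantize_per_row_per_col({10, 20, 30, 40}, 2, 2, {0.5f, 2.0f}, {1.0f, 0.25f});
    return d[3] == 20.0f ? 0 : 1;  // 40 * 2.0 * 0.25
}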
+ + original file: 3rdparty/cutlass/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h + +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/platform/platform.h" + +#include "cutlass/gemm/gemm.h" + +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_clamp.h" +#include "cutlass/epilogue/thread/linear_combination_gelu.h" +#include "cutlass/epilogue/thread/linear_combination_hardswish.h" +#include "cutlass/epilogue/thread/linear_combination_planar_complex.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/epilogue/thread/linear_combination_relu0.h" +#include "cutlass/epilogue/thread/linear_combination_sigmoid.h" + +#include "cutlass/epilogue/thread/conversion_op.h" +#include "cutlass/epilogue/thread/reduction_op.h" + +#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h" + +#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h" +#include "cutlass/epilogue/threadblock/shared_load_iterator.h" +#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h" +#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h" +#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h" +#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h" +#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h" + +#include "cutlass/epilogue/threadblock/epilogue.h" +#include "cutlass/epilogue/threadblock/interleaved_epilogue.h" + +#include "cutlass/layout/permute.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace epilogue +{ +namespace threadblock +{ + +//////////////////////////////////////////////////////////////////////////////// + +namespace detail +{ + +/// Partial specialization for bfloat16_t <= int32_t x 8 epilogues avoids shared memory bank conflicts. +template +struct DefaultIteratorsTensorOp +{ + using WarpTileIterator + = cutlass::epilogue::warp::TileIteratorTensorOpMixed; + + using SharedLoadIterator + = cutlass::epilogue::threadblock::SharedLoadIteratorMixed; + + static int const kFragmentsPerIteration = 2; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator used to load output tile from shared memory in epilogue. 
+/// +/// Satisfies: ReadableTileIterator +/// +template +class SharedLoadIteratorMixed +{ +public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = int32_t; + + using Layout = layout::RowMajor; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + + static int const kAlignment = ThreadMap::kElementsPerAccess * sizeof_bits::value / 8; + + static int const kThreads = ThreadMap::kThreads; + + /// Fragment object + using Fragment = Array; + + /// Memory access size + using AccessType = AlignedArray; + + /// Vector type used for SMEM loads + using LoadType = AlignedArray::value, ThreadMap::kElementsPerAccess), + const_min(16, kAlignment)>; + + static int const kLoadsPerAccess = AccessType::kElements / LoadType::kElements; + +private: + // + // Data members + // + + /// Byte-level pointer + LoadType const* pointers_[kLoadsPerAccess]; + + /// Stride along adjacent rows in units of LoadType + int stride_; + +public: + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + SharedLoadIteratorMixed(TensorRef ref, int thread_idx) + : stride_((ref.stride(0) / LoadType::kElements)) + { + + TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx); + + // Initialize pointers + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) + { + pointers_[i] = reinterpret_cast(ref.data()); + + int col_idx = (thread_offset.column() / kElementsPerAccess) * kLoadsPerAccess; + int bank_offset = (col_idx * int(sizeof(LoadType)) / 128) % kLoadsPerAccess; + + col_idx += (bank_offset + i) % kLoadsPerAccess; + + pointers_[i] += thread_offset.row() * stride_ + col_idx; + } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) + { + pointers_[i] += pointer_offset / LoadType::kElements; + } + } + + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const& offset) + { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) + { + pointers_[i] + += offset.row() * Shape::kRow * stride_ + offset.column() * Shape::kColumn / LoadType::kElements; + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) const + { + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) + { + + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) + { + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) + { + + int row_ptr_offset = row * ThreadMap::Delta::kRow * stride_ + + group * ThreadMap::Delta::kGroup * stride_ + cluster * ThreadMap::Delta::kCluster * stride_ + + pointer_offset / LoadType::kElements; + + int frag_row_idx + = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); + + LoadType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) + { + + int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kLoadsPerAccess; ++v) + { + + int vector_idx + = (column * ThreadMap::Delta::kColumn / kElementsPerAccess * 
kLoadsPerAccess); + + LoadType const* memory_pointer = pointers_[v] + row_ptr_offset; + + frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[vector_idx]; + } + } + } + } + } + } + + /// Loads a fragment + CUTLASS_DEVICE + void load(Fragment& frag) const + { + + load_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/epilogue_helpers.h b/csrc/cutlass_extensions/epilogue_helpers.h new file mode 100644 index 000000000000..54ba2465f76e --- /dev/null +++ b/csrc/cutlass_extensions/epilogue_helpers.h @@ -0,0 +1,139 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @file epilogue_helpers.h + * + * This file includes types for the epilogues. The empty structs exist so we can signal to template + * code the type of epilogue we want to run, and let the underlying code specify the details such as + * element types, accumulator type and elements per vector access. 
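Roughly speaking, every functor these tags select computes an elementwise D = activation(alpha * accumulator + beta * C), with the bias variants treating beta as 1 (NoBetaScaling). A scalar model of the SiLU case, for intuition only and not tied to the CUTLASS functor interfaces:

#include <cmath>
#include <cstdio>

// SiLU(x) = x * sigmoid(x)
float silu(float x)
{
    return x / (1.0f + std::exp(-x));
}

// Scalar model of a LinearCombinationSilu-style epilogue:
// one accumulator value, one source (bias / C) value, alpha and beta scalars.
float epilogue_silu(float accum, float source, float alpha, float beta)
{
    return silu(alpha * accum + beta * source);
}

int main()
{
    // alpha typically carries the dequant scale; beta = 1 models a plain bias add.
    std::printf("%f\n", epilogue_silu(12.0f, 0.5f, 0.125f, 1.0f));  // silu(2.0) ~= 1.7616
    return 0;
}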
+ * + */ + +#pragma once + +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/epilogue/thread/linear_combination_silu.h" +#include "cutlass_extensions/epilogue/thread/fused_activations.h" + +namespace tensorrt_llm +{ +namespace cutlass_extensions +{ + +struct EpilogueOpBiasSilu +{ +}; + +struct EpilogueOpBiasReLU +{ +}; + +struct EpilogueOpBiasFtGelu +{ +}; + +struct EpilogueOpDefaultSilu +{ +}; + +struct EpilogueOpDefaultReLU +{ +}; + +struct EpilogueOpDefaultFtGelu +{ +}; + +struct EpilogueOpBias +{ +}; + +struct EpilogueOpDefault +{ +}; + +template +struct Epilogue +{ +}; + +constexpr auto BiasScaleMode = cutlass::epilogue::thread::ScaleType::NoBetaScaling; + +template +struct Epilogue +{ + using Op = cutlass::epilogue::thread::LinearCombinationSilu; +}; + +template +struct Epilogue +{ + using Op = cutlass::epilogue::thread::LinearCombinationRelu; +}; + +template +struct Epilogue +{ + using Op = cutlass::epilogue::thread::LinearCombinationGeneric; +}; + +template +struct Epilogue +{ + using Op = cutlass::epilogue::thread::LinearCombination; +}; + +constexpr auto DefaultScaleMode = cutlass::epilogue::thread::ScaleType::Default; + +template +struct Epilogue +{ + using Op = cutlass::epilogue::thread::LinearCombinationSilu; +}; + +template +struct Epilogue +{ + using Op = cutlass::epilogue::thread::LinearCombinationRelu; +}; + +template +struct Epilogue +{ + using Op = cutlass::epilogue::thread::LinearCombinationGeneric; +}; + +template +struct Epilogue +{ + using Op = cutlass::epilogue::thread::LinearCombination; +}; + +} // namespace cutlass_extensions +} // namespace tensorrt_llm diff --git a/csrc/cutlass_extensions/gemm/device/gemm_universal_base_compat.h b/csrc/cutlass_extensions/gemm/device/gemm_universal_base_compat.h new file mode 100644 index 000000000000..2edd5a228b47 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/device/gemm_universal_base_compat.h @@ -0,0 +1,438 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief The universal GEMM accommodates serial reductions, parallel reductions, batched strided, and + batched array variants. +*/ + +#pragma once + +// #include + +#include "cutlass/arch/arch.h" +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/kernel/gemm_universal.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/kernel/default_gemm_universal.h" + +#include "cutlass/trace.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace device +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/* + This is the device layer from CUTLASS 2.10 (SHA - cc85b64cf676c45f98a17e3a47c0aafcf817f088) + It is replicated here since we needed to duplicate kernel level APIs for mixed dtype GEMMs + and SmoothQuant. The newer device layer is not compatible with these older kernel level APIs. + + Note: While CUTLASS 3.x supports stream-k, none of the kernels in the extensions folder support + that feature at the moment. 
+ */ + +template +class GemmUniversalBaseCompat +{ +public: + using GemmKernel = GemmKernel_; + using ThreadblockShape = typename GemmKernel::Mma::Shape; + + using ElementA = typename GemmKernel::ElementA; + using LayoutA = typename GemmKernel::LayoutA; + using TensorRefA = TensorRef; + static ComplexTransform const kTransformA = GemmKernel::kTransformA; + + using ElementB = typename GemmKernel::ElementB; + using LayoutB = typename GemmKernel::LayoutB; + using TensorRefB = TensorRef; + static ComplexTransform const kTransformB = GemmKernel::kTransformB; + + using ElementC = typename GemmKernel::ElementC; + using LayoutC = typename GemmKernel::LayoutC; + using TensorRefC = TensorRef; + using TensorRefD = TensorRef; + + using ElementAccumulator = typename GemmKernel::Mma::Policy::Operator::ElementC; + + using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp; + using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle; + using Operator = typename GemmKernel::Operator; + + /// Argument structure + using Arguments = typename GemmKernel::Arguments; + +protected: + /// Kernel parameters object + typename GemmKernel::Params params_; + +protected: + /// Private helper to obtain the grid dimensions with fix-up for split-K + static void get_grid_shape_(gemm::GemmCoord& grid_tiled_shape, int& gemm_k_size, Arguments const& args) + { + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size, {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, args.batch_count); + + gemm_k_size = args.problem_size.k(); + + if (args.mode == GemmUniversalMode::kGemm || args.mode == GemmUniversalMode::kGemmSplitKParallel) + { + + int const kAlignK + = const_max(const_max(128 / sizeof_bits::value, 128 / sizeof_bits::value), 1); + + gemm_k_size = round_up(ceil_div(args.problem_size.k(), args.batch_count), kAlignK); + + if (gemm_k_size) + { + grid_tiled_shape.k() = ceil_div(args.problem_size.k(), gemm_k_size); + } + } + } + +public: + /// Constructs the GEMM. + GemmUniversalBaseCompat() {} + + /// Determines whether the GEMM can execute the given problem. + static Status can_implement(Arguments const& args) + { + + // Determine grid shape + cutlass::gemm::GemmCoord grid_tiled_shape; + int gemm_k_size = 0; + + get_grid_shape_(grid_tiled_shape, gemm_k_size, args); + + ThreadblockSwizzle threadblock_swizzle; + dim3 grid = threadblock_swizzle.get_grid_shape(grid_tiled_shape); + + uint32_t const kGridYZMax = ((1 << (sizeof(uint16_t) * 8)) - 1); + + if (!(grid.y <= kGridYZMax && grid.z <= kGridYZMax)) + { + + return Status::kErrorInvalidProblem; + } + + return GemmKernel::can_implement(args); + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const& args) + { + + CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::get_workspace_size()"); + + size_t workspace_bytes = 0; + + // Determine grid shape + cutlass::gemm::GemmCoord grid_tiled_shape; + int gemm_k_size = 0; + + get_grid_shape_(grid_tiled_shape, gemm_k_size, args); + + if (args.mode == GemmUniversalMode::kGemmSplitKParallel) + { + + // Split-K parallel always requires a temporary workspace + workspace_bytes = sizeof(ElementC) * size_t(args.batch_stride_D) * size_t(grid_tiled_shape.k()); + } + else if (args.mode == GemmUniversalMode::kGemm && grid_tiled_shape.k() > 1) + { + + // Serial split-K only requires a temporary workspace if the number of partitions along the + // GEMM K dimension is greater than one. 
+ workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n()); + } + + CUTLASS_TRACE_HOST(" workspace_bytes: " << workspace_bytes); + + workspace_bytes += GemmKernel::get_extra_workspace_size(args, grid_tiled_shape); + + return workspace_bytes; + } + + /// Computes the grid shape + static dim3 get_grid_shape(Arguments const& args) + { + + CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::get_grid_shape()"); + + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord grid_tiled_shape; + int gemm_k_size = 0; + + get_grid_shape_(grid_tiled_shape, gemm_k_size, args); + dim3 result = threadblock_swizzle.get_grid_shape(grid_tiled_shape); + + CUTLASS_TRACE_HOST(" grid_tiled_shape: " << grid_tiled_shape << "\n" + << " result = {" << result << "}"); + + return result; + } + + /// Computes the maximum number of active blocks per multiprocessor + static int maximum_active_blocks(int smem_capacity = -1) + { + + CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::maximum_active_blocks()"); + + int max_active_blocks = -1; + int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + + CUTLASS_TRACE_HOST(" smem_size: " << smem_size << " bytes"); + + if (smem_size <= (48 << 10)) + { + + cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, Kernel, GemmKernel::kThreadCount, smem_size); + + if (result == cudaSuccess) + { + CUTLASS_TRACE_HOST(" max_active_blocks: " << max_active_blocks); + return max_active_blocks; + } + } + else + { + + // Query assuming zero shared memory then compute occupancy limit based on SMEM + cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, Kernel, GemmKernel::kThreadCount, 0); + + if (result != cudaSuccess) + { + + CUTLASS_TRACE_HOST( + " cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error " << cudaGetErrorString(result)); + + return -1; + } + + if (smem_capacity < 0) + { + int device_idx = 0; + result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) + { + return -1; + } + + cudaDeviceProp properties; + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) + { + return -1; + } + + smem_capacity = static_cast(properties.sharedMemPerMultiprocessor); + } + + int occupancy = std::min(max_active_blocks, smem_capacity / smem_size); + + CUTLASS_TRACE_HOST(" occupancy: " << occupancy); + + return occupancy; + } + + CUTLASS_TRACE_HOST(" returning internal error"); + + return -1; + } + + /// Initializes GEMM state from arguments. + Status initialize(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) + { + + CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::initialize() - workspace " + << workspace << ", stream: " << (stream ? 
"non-null" : "null")); + + size_t workspace_bytes = get_workspace_size(args); + + CUTLASS_TRACE_HOST(" workspace_bytes: " << workspace_bytes); + + if (workspace_bytes) + { + + if (!workspace) + { + CUTLASS_TRACE_HOST(" error: device workspace must not be null"); + + return Status::kErrorWorkspaceNull; + } + + if (args.mode == GemmUniversalMode::kGemm) + { + CUTLASS_TRACE_HOST(" clearing device workspace"); + cudaError_t result = cudaMemsetAsync(workspace, 0, workspace_bytes, stream); + + if (result != cudaSuccess) + { + CUTLASS_TRACE_HOST(" cudaMemsetAsync() returned error " << cudaGetErrorString(result)); + + return Status::kErrorInternal; + } + } + } + + // Get CUDA grid shape + cutlass::gemm::GemmCoord grid_tiled_shape; + int gemm_k_size = 0; + + get_grid_shape_(grid_tiled_shape, gemm_k_size, args); + + // Initialize the Params structure + params_ = typename GemmKernel::Params(args, grid_tiled_shape, gemm_k_size, static_cast(workspace)); + + // Specify shared memory capacity for kernel. + int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) + { + cudaError_t result + = cudaFuncSetAttribute(Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + + if (result != cudaSuccess) + { + return Status::kErrorInternal; + } + } + + return Status::kSuccess; + } + + /// Lightweight update given a subset of arguments + Status update(Arguments const& args, void* workspace = nullptr) + { + + CUTLASS_TRACE_HOST("GemmUniversalBaseCompat()::update() - workspace: " << workspace); + + size_t workspace_bytes = get_workspace_size(args); + + if (workspace_bytes && !workspace) + { + return Status::kErrorWorkspaceNull; + } + + params_.update(args, workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) + { + CUTLASS_TRACE_HOST("GemmUniversalBaseCompat::run()"); + + // + // Configure grid and block dimensions + // + + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(GemmKernel::kThreadCount, 1, 1); + + int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + + // + // Launch kernel + // + + CUTLASS_TRACE_HOST(" grid: (" << grid << "), block: (" << block << "), SMEM: " << smem_size << " bytes"); + + // Launch + cutlass::Kernel<<>>(params_); + + // + // Query for errors + // + cudaError_t result = cudaGetLastError(); + + if (result != cudaSuccess) + { + CUTLASS_TRACE_HOST(" grid launch failed with error " << cudaGetErrorString(result)); + return Status::kErrorInternal; + } + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) + { + return run(stream); + } + + /// Runs the kernel using initialized state. 
+ Status operator()(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr) + { + + Status status = initialize(args, workspace, stream); + + if (status == Status::kSuccess) + { + status = run(stream); + } + + return status; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h b/csrc/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h new file mode 100644 index 000000000000..1886a253fb00 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h @@ -0,0 +1,138 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/mma.h" +#include "cutlass/bfloat16.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/layout/matrix.h" + +#include "cutlass_extensions/arch/mma.h" +#include "cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h" + +namespace cutlass +{ +namespace gemm +{ +namespace kernel +{ + +template +struct MixedGemmArchTraits +{ +}; + +template +struct MixedGemmArchTraits +{ + static constexpr int Stages = 2; + using OperatorClass = cutlass::arch::OpClassSimt; + using AccType = float; + using LayoutB = cutlass::layout::ColumnMajor; + + static constexpr int ElementsPerAccessA = 1; + static constexpr int ElementsPerAccessB = 1; + static constexpr int ElementsPerAccessC = 1; + static constexpr int ThreadblockK = 8; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// ========================= Volta Traits =========================== +// Volta will always dequantize after the global memory load. +// This will instantiate any HMMA tensorcore kernels for Volta. +// Note that volta does not have native bfloat support so weights and activations will be casted to fp16 +// and compute will happen in fp16 then will be converted for bf16 output. 
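The Volta/Turing bf16 fallback described above boils down to a cast chain: narrow the bf16 inputs to fp16 (what the HMMA consumes), accumulate in fp32 (AccType above is float), and convert the result back to bf16. A host-side sketch assuming CUDA 11+ headers; the real kernels do this per fragment on the device:

#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cstdio>

int main()
{
    // bf16 inputs, e.g. an activation value and a dequantized weight value.
    __nv_bfloat16 a = __float2bfloat16(1.5f);
    __nv_bfloat16 b = __float2bfloat16(-0.25f);

    // Pre-Ampere path: narrow bf16 -> fp16 operands, accumulate in fp32...
    __half a_h = __float2half(__bfloat162float(a));
    __half b_h = __float2half(__bfloat162float(b));
    float acc  = __half2float(a_h) * __half2float(b_h);

    // ...then convert the accumulator back to bf16 for the output tensor.
    __nv_bfloat16 out = __float2bfloat16(acc);
    std::printf("out = %f\n", __bfloat162float(out));
    return 0;
}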
+template +struct MixedGemmArchTraits::value + || cutlass::platform::is_same::value>::type> +{ +private: + using LayoutDetails = LayoutDetailsB; + +public: + static constexpr int ThreadblockK = LayoutDetails::ThreadblockK; + + using OperatorClass = cutlass::arch::OpClassTensorOp; + using AccType = float; + using LayoutB = typename LayoutDetails::Layout; + + static constexpr int ElementsPerAccessA = 128 / cutlass::sizeof_bits::value; + static constexpr int ElementsPerAccessB = LayoutDetails::ElementsPerAccess; + static constexpr int ElementsPerAccessC = 128 / cutlass::sizeof_bits::value; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Operator = typename LayoutDetails::Operator; +}; + +// ======================= Turing Traits ============================== +// Note that turing does not have native bfloat support so weights and activations will be casted to fp16 +// and compute will happen in fp16 then will be converted for bf16 output. +template +struct MixedGemmArchTraits::value + || cutlass::platform::is_same::value>::type> +{ +private: + using LayoutDetails = LayoutDetailsB; + +public: + static constexpr int ThreadblockK = LayoutDetails::ThreadblockK; + + using OperatorClass = cutlass::arch::OpClassTensorOp; + using AccType = float; + using LayoutB = typename LayoutDetails::Layout; + + static constexpr int ElementsPerAccessA = 128 / cutlass::sizeof_bits::value; + static constexpr int ElementsPerAccessB = LayoutDetails::ElementsPerAccess; + static constexpr int ElementsPerAccessC = 128 / cutlass::sizeof_bits::value; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Operator = typename LayoutDetails::Operator; +}; + +// ======================= Ampere Traits ============================== +template +struct MixedGemmArchTraits::value + || cutlass::platform::is_same::value>::type> +{ +private: + using LayoutDetails = LayoutDetailsB; + +public: + static constexpr int ThreadblockK = LayoutDetails::ThreadblockK; + + using OperatorClass = cutlass::arch::OpClassTensorOp; + using AccType = float; + using LayoutB = typename LayoutDetails::Layout; + + static constexpr int ElementsPerAccessA = 128 / cutlass::sizeof_bits::value; + static constexpr int ElementsPerAccessB = LayoutDetails::ElementsPerAccess; + static constexpr int ElementsPerAccessC = 128 / cutlass::sizeof_bits::value; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using Operator = typename LayoutDetails::Operator; +}; + +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/csrc/cutlass_extensions/gemm/kernel/default_int8_traits.h b/csrc/cutlass_extensions/gemm/kernel/default_int8_traits.h new file mode 100644 index 000000000000..58b98a015368 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/kernel/default_int8_traits.h @@ -0,0 +1,57 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
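The ElementsPerAccess values in these traits all come from targeting 128-bit vectorized memory accesses. The arithmetic as a constexpr sketch:

// 128-bit vectorized accesses: elements per access = 128 / bits-per-element.
constexpr int elements_per_128b_access(int bits_per_element)
{
    return 128 / bits_per_element;
}

static_assert(elements_per_128b_access(16) == 8,  "fp16 / bf16 activations");
static_assert(elements_per_128b_access(32) == 4,  "fp32 output / accumulators");
static_assert(elements_per_128b_access(8)  == 16, "int8 weights");
static_assert(elements_per_128b_access(4)  == 32, "int4 weights");

int main() { return 0; }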
+ */ +#pragma once + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/mma.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/layout/matrix.h" + +namespace cutlass +{ +namespace gemm +{ +namespace kernel +{ + +template +struct Int8GemmArchTraits +{ + using OperatorClass = cutlass::arch::OpClassSimt; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; +}; + +// ======================= Turing Traits ============================== +template <> +struct Int8GemmArchTraits +{ + using OperatorClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +}; + +// ======================= Ampere Traits ============================== +template <> +struct Int8GemmArchTraits +{ + using OperatorClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; +}; + +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/csrc/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h b/csrc/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h new file mode 100644 index 000000000000..36ae924eebd2 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h @@ -0,0 +1,574 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/arch/arch.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/semaphore.h" + +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace kernel +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail +{ +template +inline constexpr bool dependent_false_v = false; +} + +template +struct GemmFpAIntB +{ + + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static bool const kSplitKSerial = SplitKSerial; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Element; + using ElementC = typename Epilogue::OutputTileIterator::Element; + using LayoutC = typename Mma::LayoutC; + using ElementScale = ElementC; + + static ComplexTransform const kTransformA = Mma::kTransformA; + static ComplexTransform const kTransformB = Mma::kTransformA; + + // Type definitions about the mainloop. + using Operator = typename Mma::Operator; + using OperatorClass = typename Mma::Operator::OperatorClass; + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename Mma::Operator::Shape; + using InstructionShape = typename Mma::Policy::Operator::InstructionShape; + using ArchTag = typename Mma::ArchTag; + + static int const kStages = Mma::kStages; + static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + static constexpr int kInterleave = Mma::IteratorB::Shape::kRow / Mma::Shape::kK; + + /// Parameters structure + struct Arguments + { + GemmUniversalMode mode = GemmUniversalMode::kGemm; + + cutlass::gemm::GemmCoord problem_size; + int group_size; + typename Mma::IteratorA::TensorRef ref_A; + typename Mma::IteratorB::TensorRef ref_B; + typename Mma::IteratorScale::TensorRef ref_scale; + typename Mma::IteratorScale::TensorRef ref_zero; + typename Epilogue::OutputTileIterator::TensorRef ref_C; + typename Epilogue::OutputTileIterator::TensorRef ref_D; + + // Control serial split-k + int batch_count; + + typename EpilogueOutputOp::Params output_op; + + // For gather+scatter operations + int const* gather_A_indices; + int const* gather_B_indices; + int const* scatter_D_indices; + + // Included so we can use Gemm Universal + int batch_stride_D = 0; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Arguments() {} + + CUTLASS_HOST_DEVICE + Arguments(cutlass::gemm::GemmCoord const& problem_size, const int group_size, + typename Mma::IteratorA::TensorRef ref_A, typename Mma::IteratorB::TensorRef ref_B, + typename Mma::IteratorScale::TensorRef ref_scale, typename Mma::IteratorScale::TensorRef ref_zero, + typename Epilogue::OutputTileIterator::TensorRef ref_C, + typename Epilogue::OutputTileIterator::TensorRef ref_D, int serial_split_k_factor, + typename EpilogueOutputOp::Params output_op = typename EpilogueOutputOp::Params(), + int const* gather_A_indices = nullptr, int const* 
gather_B_indices = nullptr, + int const* scatter_D_indices = nullptr) + : problem_size(problem_size) + , group_size(group_size) + , ref_A(ref_A) + , ref_B(ref_B) + , ref_scale(ref_scale) + , ref_zero(ref_zero) + , ref_C(ref_C) + , ref_D(ref_D) + , batch_count(serial_split_k_factor) + , output_op(output_op) + , gather_A_indices(gather_A_indices) + , gather_B_indices(gather_B_indices) + , scatter_D_indices(scatter_D_indices) + { + } + }; + + /// Parameters structure + struct Params + { + cutlass::gemm::GemmCoord problem_size; + int group_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + int swizzle_log_tile; + typename Mma::IteratorA::Params params_A; + typename Mma::IteratorA::TensorRef ref_A; + typename Mma::IteratorB::Params params_B; + typename Mma::IteratorB::TensorRef ref_B; + typename Mma::IteratorScale::Params params_scale; + typename Mma::IteratorScale::TensorRef ref_scale; + typename Mma::IteratorScale::TensorRef ref_zero; + typename Epilogue::OutputTileIterator::Params params_C; + typename Epilogue::OutputTileIterator::TensorRef ref_C; + typename Epilogue::OutputTileIterator::Params params_D; + typename Epilogue::OutputTileIterator::TensorRef ref_D; + typename EpilogueOutputOp::Params output_op; + int* semaphore; + int gemm_k_size; + // For gather+scatter operations + int const* gather_A_indices; + int const* gather_B_indices; + int const* scatter_D_indices; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() + : swizzle_log_tile(0) + , semaphore(0) + , gemm_k_size(0) + { + } + + CUTLASS_HOST_DEVICE + Params(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape, const int gemm_k_size, + void* workspace = nullptr) + : problem_size(args.problem_size) + , group_size(args.group_size) + , grid_tiled_shape(grid_tiled_shape) + , swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)) + , params_A(args.ref_A.layout()) + , ref_A(args.ref_A) + , params_B(args.ref_B.layout()) + , ref_B(args.ref_B) + , params_scale(args.ref_scale.layout()) + , ref_scale(args.ref_scale) + , ref_zero(args.ref_zero) + , params_C(args.ref_C.layout()) + , ref_C(args.ref_C) + , params_D(args.ref_D.layout()) + , ref_D(args.ref_D) + , output_op(args.output_op) + , semaphore(static_cast(workspace)) + , gemm_k_size(gemm_k_size) + , gather_A_indices(args.gather_A_indices) + , gather_B_indices(args.gather_B_indices) + , scatter_D_indices(args.scatter_D_indices) + { + } + }; + + /// Shared memory storage structure + union SharedStorage + { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + GemmFpAIntB() {} + + /// Determines whether kernel satisfies alignment + CUTLASS_HOST_DEVICE + static Status can_implement(Arguments const& args) + { + + static int const kAlignmentA + = (platform::is_same>::value) ? 32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB + = (platform::is_same>::value) ? 32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorB::AccessType::kElements; + + static int const kAlignmentScale = Mma::IteratorScale::AccessType::kElements; + + static int const kAlignmentC = (platform::is_same>::value) + ? 32 + : (platform::is_same>::value) + ? 
64 + : Epilogue::OutputTileIterator::kElementsPerAccess; + + if (!TensorRef_aligned(args.ref_A, kAlignmentA)) + { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(args.ref_B, kAlignmentB)) + { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(args.ref_scale, kAlignmentScale)) + { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(args.ref_zero, kAlignmentScale)) + { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(args.ref_C, kAlignmentC)) + { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(args.ref_D, kAlignmentC)) + { + return Status::kErrorMisalignedOperand; + } + + if (!args.ref_scale.good()) + { + return Status::kErrorNotSupported; + } + + if constexpr (hasZero(Mma::QuantOp)) + { + if (!args.ref_zero.good()) + { + return Status::kErrorNotSupported; + } + } + else + { + if (args.ref_zero.good()) + { + return Status::kErrorNotSupported; + } + } + + if constexpr (isFinegrained(Mma::QuantOp)) + { + if (args.group_size != 64 && args.group_size != 128) + { + return Status::kErrorNotSupported; + } + } + + return Status::kSuccess; + } + + static size_t get_extra_workspace_size(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape) + { + + return 0; + } + + // The dummy template parameter is not used and exists so that we can compile this code using + // a standard earlier than C++17. Prior to C++17, fully specialized templates HAD to exists in + // a namespace + template + struct KernelRunner + { + CUTLASS_DEVICE + static void run_kernel(Params const& params, SharedStorage& shared_storage) + { + CUTLASS_NOT_IMPLEMENTED(); + } + }; + + // Initializes the fine grained scale+bias iterator. Needed since the fine grained iterator + // has a different constructor signature than a regular cutlass iterator + template = true> + CUTLASS_DEVICE static IteratorScale initialize_scale(typename IteratorScale::Params const& params, + typename IteratorScale::Pointer pointer_scale, typename IteratorScale::Pointer pointer_zero, + typename IteratorScale::TensorCoord extent, int thread_id, + typename IteratorScale::TensorCoord const& threadblock_offset, int group_size) + { + + return IteratorScale(params, pointer_scale, pointer_zero, extent, thread_id, threadblock_offset, group_size); + } + + template = true> + CUTLASS_DEVICE static IteratorScale initialize_scale(typename IteratorScale::Params const& params, + typename IteratorScale::Pointer pointer_scale, typename IteratorScale::Pointer pointer_zero, + typename IteratorScale::TensorCoord extent, int thread_id, + typename IteratorScale::TensorCoord const& threadblock_offset, int group_size) + { + + return IteratorScale(params, pointer_scale, extent, thread_id, threadblock_offset); + } + + template + struct KernelRunner + { + CUTLASS_DEVICE + static void run_kernel(Params const& params, SharedStorage& shared_storage) + { + using LayoutB = typename Mma::IteratorB::Layout; + static_assert(platform::is_same::value && kInterleave == 1 + || platform::is_same::value && kInterleave >= 1, + "B must be row major/col major OR col major interleaved."); + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_offset + = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() + || params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) + { + + return; + } + + // 
Compute initial location in logical coordinates + cutlass::MatrixCoord tb_offset_A{ + threadblock_tile_offset.m() * Mma::Shape::kM, + threadblock_tile_offset.k() * params.gemm_k_size, + }; + + cutlass::MatrixCoord tb_offset_B{threadblock_tile_offset.k() * params.gemm_k_size * kInterleave, + threadblock_tile_offset.n() * Mma::Shape::kN / kInterleave}; + + typename MatrixCoord::Index fg_row_offset = threadblock_tile_offset.k() * params.gemm_k_size / 64; + typename MatrixCoord::Index scale_row_offset = isFinegrained(Mma::QuantOp) ? fg_row_offset : 0; + cutlass::MatrixCoord tb_offset_scale{scale_row_offset, threadblock_tile_offset.n() * Mma::Shape::kN}; + + // Problem size is a function of threadblock index in the K dimension + int problem_size_k = min(params.problem_size.k(), (threadblock_tile_offset.k() + 1) * params.gemm_k_size); + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations = (problem_size_k - tb_offset_A.column() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A(params.params_A, params.ref_A.data(), + {params.problem_size.m(), problem_size_k}, thread_idx, tb_offset_A, params.gather_A_indices); + + typename Mma::IteratorB iterator_B(params.params_B, params.ref_B.data(), + {problem_size_k * kInterleave, params.problem_size.n() / kInterleave}, thread_idx, tb_offset_B, + params.gather_B_indices); + + typename MatrixCoord::Index scale_row_extent = isFinegrained(Mma::QuantOp) ? problem_size_k / 64 : 1; + typename Mma::IteratorScale iterator_scale = initialize_scale( + params.params_scale, params.ref_scale.data(), params.ref_zero.data(), + {scale_row_extent, params.problem_size.n()}, thread_idx, tb_offset_scale, params.group_size); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, params.group_size, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + if (!kSplitKSerial || gemm_k_iterations > 0) + { + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, iterator_scale, accumulators); + } + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + // + // Masked tile iterators constructed from members + // + + threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // assume identity swizzle + MatrixCoord threadblock_offset( + threadblock_tile_offset.m() * Mma::Shape::kM, threadblock_tile_offset.n() * Mma::Shape::kN); + + int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + + // Construct the semaphore. + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // If performing a reduction via split-K, fetch the initial synchronization + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) + { + + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); + } + + // Tile iterator loading from source tensor. 
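+            // (Note: under serial split-K, every k-partition except the first reads its "source"
+            // accumulator from the D tensor rather than C -- see the iterator_C = iterator_D swap
+            // further below -- and the semaphore sequences the partial reductions in k order.)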
+ typename Epilogue::OutputTileIterator iterator_C(params.params_C, params.ref_C.data(), + params.problem_size.mn(), thread_idx, threadblock_offset, params.scatter_D_indices); + + // Tile iterator writing to destination tensor. + typename Epilogue::OutputTileIterator iterator_D(params.params_D, params.ref_D.data(), + params.problem_size.mn(), thread_idx, threadblock_offset, params.scatter_D_indices); + + Epilogue epilogue(shared_storage.epilogue, thread_idx, warp_idx, lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) + { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_offset.k()) + { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_offset.k()); + } + + // Execute the epilogue operator to update the destination tensor. + epilogue(output_op, iterator_D, accumulators, iterator_C); + + // + // Release the semaphore + // + + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) + { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) + { + + // The final threadblock resets the semaphore for subsequent grids. + lock = 0; + } + else + { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_offset.k() + 1; + } + + semaphore.release(lock); + } + } + }; + + /* + To improve compilation speed, we do not compile the device operator if the CUDA_ARCH does not correspond + to the ArchTag of the cutlass kernel operator. + */ + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const& params, SharedStorage& shared_storage) + { +#if defined(__CUDA_ARCH__) +#if (__CUDA_ARCH__ >= 700) && (__CUDA_ARCH__ < 750) + static constexpr bool compile_needed = platform::is_same::value; + KernelRunner::run_kernel(params, shared_storage); +#elif (__CUDA_ARCH__ >= 750) && (__CUDA_ARCH__ < 800) + static constexpr bool compile_needed = platform::is_same::value; + KernelRunner::run_kernel(params, shared_storage); +#elif (__CUDA_ARCH__ >= 800) && (__CUDA_ARCH__ <= 900) + static constexpr bool compile_needed = platform::is_same::value; + KernelRunner::run_kernel(params, shared_storage); +#else + static_assert( + false, "Invalid architecture being compiled. Only Volta+ supported in weight-only quantization kernels."); +#endif +#else + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/csrc/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h b/csrc/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h new file mode 100644 index 000000000000..80a4d8560859 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h @@ -0,0 +1,73 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file + \brief Scheduler for grouped GEMM +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/kernel/gemm_grouped_problem_visitor.h" +#include "cutlass/matrix_coord.h" + +#include "cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h" +#include "cutlass_extensions/gemm/kernel/moe_problem_visitor.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace kernel +{ + +/// Visitor class to abstract away the algorithm for iterating over tiles +template +struct GemmMoeProblemVisitor + : public MoeProblemVisitor, ThreadblockShape, + GroupScheduleMode_, PrefetchTileCount, ThreadCount> +{ + + static bool const kTransposed = Transposed; + + using ProblemSizeHelper = detail::GemmGroupedProblemSizeHelper; + using Base + = MoeProblemVisitor; + using Params = typename Base::Params; + using SharedStorage = typename Base::SharedStorage; + + // + // Methods + // + CUTLASS_DEVICE + GemmMoeProblemVisitor(Params const& params_, SharedStorage& shared_storage_, int32_t block_idx) + : Base(params_, shared_storage_, block_idx) + { + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/kernel/gemm_with_epilogue_visitor.h b/csrc/cutlass_extensions/gemm/kernel/gemm_with_epilogue_visitor.h new file mode 100644 index 000000000000..54602754279f --- /dev/null +++ b/csrc/cutlass_extensions/gemm/kernel/gemm_with_epilogue_visitor.h @@ -0,0 +1,545 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief GEMM kernel to support the epilogue visitor model + for customized softmax partial reduction epilogue fusion. + + This source file will likely be moved to `include/cutlass/gemm/kernel/` in the future once + its usage has been stabilized. For now, it is included in this example to demonstrate + some basic output fusion options. + + original file: 3rdparty/cutlass/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h +*/ + +#pragma once + +#include "cutlass/complex.h" +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/semaphore.h" +#include "cutlass/trace.h" + +#include "cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale.h" + +namespace tk = tensorrt_llm::common; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace kernel +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct GemmWithEpilogueVisitor +{ +public: + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueVisitor = typename Epilogue::Visitor; + using ThreadblockSwizzle = ThreadblockSwizzle_; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using TensorRefA = TensorRef; + + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using TensorRefB = TensorRef; + + using ElementCompute = typename EpilogueVisitor::ElementCompute; + using LayoutAlphaCol = cutlass::layout::RowMajor; + using LayoutAlphaRow = cutlass::layout::ColumnMajor; + using TensorRefAlphaCol = TensorRef; + using TensorRefAlphaRow = TensorRef; + + using ElementC = typename EpilogueVisitor::ElementOutput; + using LayoutC = typename Epilogue::Layout; + using TensorRefC = TensorRef; + + static ComplexTransform const kTransformA = Mma::kTransformA; + static ComplexTransform const kTransformB = Mma::kTransformB; + using Operator = typename Mma::Operator; + + using OperatorClass = typename Mma::Operator::OperatorClass; + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename Mma::Operator::Shape; + using InstructionShape = typename Mma::Policy::Operator::InstructionShape; + using ArchTag = typename Mma::ArchTag; + using EpilogueOutputOp = + typename Epilogue::Visitor::ElementwiseFunctor; // Define type so GemmUniversalBase doesn't complain + + static int const kStages = Mma::kStages; + static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentC = EpilogueVisitor::kElementsPerAccess; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const 
kThreadCount = 32 * WarpCount::kCount; + + /// Split-K preserves splits that are 128b aligned + static int const kSplitKAlignment + = const_max(128 / sizeof_bits::value, 128 / sizeof_bits::value); + + // + // Structures + // + + /// Argument structure + struct Arguments + { + + // + // Data members + // + + GemmUniversalMode mode; + GemmCoord problem_size; + int batch_count; + + TensorRefA ref_A; + TensorRefB ref_B; + tk::QuantMode quant_option; + TensorRefAlphaCol ref_alpha_col; + TensorRefAlphaRow ref_alpha_row; + TensorRefC ref_C; + TensorRefC ref_D; + + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_D; + + typename EpilogueVisitor::Arguments epilogue_visitor; + + // + // Methods + // + + Arguments() + : mode(GemmUniversalMode::kGemm) + , batch_count(1) + { + } + + /// constructs an arguments structure + Arguments(GemmUniversalMode mode_, GemmCoord problem_size_, int batch_count_, TensorRefA ref_A_, + TensorRefB ref_B_, tk::QuantMode quant_option_, TensorRefAlphaCol ref_alpha_col_, + TensorRefAlphaRow ref_alpha_row_, TensorRefC ref_C_, TensorRefC ref_D_, int64_t batch_stride_A_, + int64_t batch_stride_B_, typename EpilogueVisitor::Arguments epilogue_visitor_) + : mode(mode_) + , problem_size(problem_size_) + , batch_count(batch_count_) + , ref_A(ref_A_) + , ref_B(ref_B_) + , quant_option(quant_option_) + , ref_alpha_col(ref_alpha_col_) + , ref_alpha_row(ref_alpha_row_) + , ref_C(ref_C_) + , ref_D(ref_D_) + , batch_stride_A(batch_stride_A_) + , batch_stride_B(batch_stride_B_) + , batch_stride_D(0) + , epilogue_visitor(epilogue_visitor_) + { + } + }; + + // + // Structure for precomputing values in host memory and passing to kernels + // + + /// Parameters structure + struct Params + { + + cutlass::gemm::GemmCoord problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + int swizzle_log_tile; + + typename Mma::IteratorA::Params params_A; + typename Mma::IteratorB::Params params_B; + typename EpilogueVisitor::ScaleTileIterator::Params params_alpha_col; + typename EpilogueVisitor::ScaleTileIterator::Params params_alpha_row; + typename EpilogueVisitor::OutputTileIterator::Params params_C; + typename EpilogueVisitor::OutputTileIterator::Params params_D; + + GemmUniversalMode mode; + int batch_count; + int gemm_k_size; + + void* ptr_A; + void* ptr_B; + tk::QuantMode quant_option; + typename EpilogueVisitor::ScaleTileIterator::Element* ptr_alpha_col; + typename EpilogueVisitor::ScaleTileIterator::Element* ptr_alpha_row; + ElementC* ptr_C; + ElementC* ptr_D; + + int64_t batch_stride_A; + int64_t batch_stride_B; + + typename EpilogueVisitor::Params epilogue_visitor; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() + : swizzle_log_tile(0) + , params_A(0) + , params_B(0) + , params_alpha_col(0) + , params_C(0) + , params_D(0) + , batch_count(0) + , gemm_k_size(0) + , mode(cutlass::gemm::GemmUniversalMode::kGemm) + , ptr_A(nullptr) + , ptr_B(nullptr) + , ptr_alpha_col(nullptr) + , ptr_alpha_row(nullptr) + , ptr_C(nullptr) + , ptr_D(nullptr) + , batch_stride_A(0) + , batch_stride_B(0) + { + } + + Params( + Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape_, int gemm_k_size_, int* workspace_) + : problem_size(args.problem_size) + , swizzle_log_tile(0) + , params_A(args.ref_A.layout()) + , params_B(args.ref_B.layout()) + , params_alpha_col(args.ref_alpha_col.layout()) + , params_alpha_row(args.ref_alpha_col.layout()) + , params_C(args.ref_C.layout()) + , params_D(args.ref_D.layout()) + , mode(args.mode) + , batch_count(args.batch_count) + , 
gemm_k_size(args.problem_size.k()) + , ptr_A(args.ref_A.data()) + , ptr_B(args.ref_B.data()) + , quant_option(args.quant_option) + , ptr_alpha_col(args.ref_alpha_col.data()) + , ptr_alpha_row(args.ref_alpha_row.data()) + , ptr_C(args.ref_C.data()) + , ptr_D(args.ref_D.data()) + , batch_stride_A(args.batch_stride_A) + , batch_stride_B(args.batch_stride_B) + , epilogue_visitor(args.epilogue_visitor) + { + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape(args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, args.batch_count); + + if (args.mode == GemmUniversalMode::kGemm || args.mode == GemmUniversalMode::kGemmSplitKParallel) + { + + int const kAlignK + = const_max(const_max(128 / sizeof_bits::value, 128 / sizeof_bits::value), 1); + + gemm_k_size = round_up(ceil_div(args.problem_size.k(), args.batch_count), kAlignK); + + if (gemm_k_size) + { + grid_tiled_shape.k() = ceil_div(args.problem_size.k(), gemm_k_size); + } + } + + swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape); + } + }; + + /// Shared memory storage structure + union SharedStorage + { + + typename Mma::SharedStorage main_loop; + + struct + { + typename Epilogue::SharedStorage epilogue; + typename EpilogueVisitor::SharedStorage visitor; + } epilogue; + }; + +public: + // + // Methods + // + + CUTLASS_DEVICE + GemmWithEpilogueVisitor() {} + + /// Determines whether kernel satisfies alignment + static Status can_implement(cutlass::gemm::GemmCoord const& problem_size) + { + + CUTLASS_TRACE_HOST("GemmWithEpilogueVisitor::can_implement()"); + + static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentC = EpilogueVisitor::OutputTileIterator::kElementsPerAccess; + + bool isAMisaligned = false; + bool isBMisaligned = false; + bool isCMisaligned = false; + + if (platform::is_same::value) + { + isAMisaligned = problem_size.k() % kAlignmentA; + } + else if (platform::is_same::value) + { + isAMisaligned = problem_size.m() % kAlignmentA; + } + else if (platform::is_same>::value + || platform::is_same>::value) + { + isAMisaligned = problem_size.k() % kAlignmentA; + } + + if (platform::is_same::value) + { + isBMisaligned = problem_size.n() % kAlignmentB; + } + else if (platform::is_same::value) + { + isBMisaligned = problem_size.k() % kAlignmentB; + } + else if (platform::is_same>::value + || platform::is_same>::value) + { + isBMisaligned = problem_size.k() % kAlignmentB; + } + + if (platform::is_same::value) + { + isCMisaligned = problem_size.n() % kAlignmentC; + } + else if (platform::is_same::value) + { + isCMisaligned = problem_size.m() % kAlignmentC; + } + else if (platform::is_same>::value + || platform::is_same>::value) + { + isCMisaligned = problem_size.n() % kAlignmentC; + } + + if (isAMisaligned) + { + CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for A operand"); + return Status::kErrorMisalignedOperand; + } + + if (isBMisaligned) + { + CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for B operand"); + return Status::kErrorMisalignedOperand; + } + + if (isCMisaligned) + { + CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for C operand"); + return Status::kErrorMisalignedOperand; + } + + CUTLASS_TRACE_HOST(" returning kSuccess"); + + return Status::kSuccess; + } + + static Status can_implement(Arguments const& args) + { + return can_implement(args.problem_size); + } + + static size_t 
get_extra_workspace_size(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape) + { + + return 0; + } + +#define SPLIT_K_ENABLED 1 + + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const& params, SharedStorage& shared_storage) + { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() + || params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) + { + + return; + } + + int offset_k = 0; + int problem_size_k = params.problem_size.k(); + + ElementA* ptr_A = static_cast(params.ptr_A); + ElementB* ptr_B = static_cast(params.ptr_B); + +#if SPLIT_K_ENABLED + // + // Fetch pointers based on mode. + // + if (params.mode == GemmUniversalMode::kGemm || params.mode == GemmUniversalMode::kGemmSplitKParallel) + { + + if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) + { + + problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; + } + + offset_k = threadblock_tile_offset.k() * params.gemm_k_size; + } + else if (params.mode == GemmUniversalMode::kBatched) + { + ptr_A += threadblock_tile_offset.k() * params.batch_stride_A; + ptr_B += threadblock_tile_offset.k() * params.batch_stride_B; + } + else if (params.mode == GemmUniversalMode::kArray) + { + ptr_A = static_cast(params.ptr_A)[threadblock_tile_offset.k()]; + ptr_B = static_cast(params.ptr_B)[threadblock_tile_offset.k()]; + } +#endif + + // Compute initial location in logical coordinates + cutlass::MatrixCoord tb_offset_A{ + threadblock_tile_offset.m() * Mma::Shape::kM, + offset_k, + }; + + cutlass::MatrixCoord tb_offset_B{offset_k, threadblock_tile_offset.n() * Mma::Shape::kN}; + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.params_A, ptr_A, {params.problem_size.m(), problem_size_k}, thread_idx, tb_offset_A); + + typename Mma::IteratorB iterator_B( + params.params_B, ptr_B, {problem_size_k, params.problem_size.n()}, thread_idx, tb_offset_B); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. 
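+        // (Lane 0 computes threadIdx.x / 32 and __shfl_sync copies that single value to all 32
+        // lanes, so warp_idx is provably uniform within the warp and dependent control flow can
+        // be compiled without divergence handling.)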
+ int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); + + // + // Masked tile iterators constructed from members + // + + threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // assume identity swizzle + MatrixCoord threadblock_offset( + threadblock_tile_offset.m() * Mma::Shape::kM, threadblock_tile_offset.n() * Mma::Shape::kN); + + int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + + // + // Construct the epilogue visitor + // + + EpilogueVisitor epilogue_visitor(params.epilogue_visitor, shared_storage.epilogue.visitor, + params.problem_size.mn(), thread_idx, warp_idx, lane_idx, params.params_alpha_col, params.params_C, + params.params_D, params.quant_option, params.ptr_alpha_row, params.ptr_alpha_col, params.ptr_C, + params.ptr_D, threadblock_offset, blockIdx.y * params.problem_size.m()); + + if (params.mode == GemmUniversalMode::kGemm) + { + // Indicate which position in a serial reduction the output operator is currently updating + epilogue_visitor.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); + } + else if (params.mode == GemmUniversalMode::kBatched || params.mode == GemmUniversalMode::kArray) + { + epilogue_visitor.set_batch_index(threadblock_tile_offset.k()); + } + + // Construct the epilogue + Epilogue epilogue(shared_storage.epilogue.epilogue, thread_idx, warp_idx, lane_idx); + + // Execute the epilogue operator to update the destination tensor. + epilogue(epilogue_visitor, accumulators); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h b/csrc/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h new file mode 100644 index 000000000000..b8176eb52167 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h @@ -0,0 +1,114 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + This file exists so that we use the same weight layout for MoE grouped gemm and regular gemm when the weight is + quantized. 
The preprocessing code reads this template to know how to organize the quantized weight matrices + to be consumed by CUTLASS. + + Note that for int4, ThreadBlockK MUST be 64. + + */ + +#pragma once + +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/mma.h" +#include "cutlass/platform/platform.h" + +#include "cutlass_extensions/arch/mma.h" +#include "cutlass_extensions/tile_interleaved_layout.h" + +namespace cutlass +{ +namespace gemm +{ +namespace kernel +{ + +template +struct LayoutDetailsB +{ +}; + +// Volta specialiations. Volta will dequantize before STS, so we need a different operator +template +struct LayoutDetailsB +{ + static constexpr int ThreadblockK = 64; + using Layout = layout::ColumnMajor; + static constexpr int ElementsPerAccess = 8; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Specializations for Turing+ when B is FP16. These are currently only used for MoE networks. +// TODO - Switch this to column major for weights since gemms should be more performant. +template +struct LayoutDetailsB= 75>::type> +{ + static constexpr int ThreadblockK = 64; + using Layout = layout::ColumnMajor; + static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +template +struct LayoutDetailsB= 75>::type> +{ + static constexpr int ThreadblockK = 64; + using Layout = layout::ColumnMajor; + static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Specializations for Turing+ when B is quantized. These can use the operator OpMultiplyAddDequantizeInterleavedBToA, +// which signals that we want to dequantize after loading from smem. +template +struct LayoutDetailsB= 75>::type> +{ + static constexpr int ThreadblockK = 64; + +private: + static constexpr int ElementsPerCacheLine = 128 * 8 / sizeof_bits::value; + static constexpr int ColumnsInterleaved = ElementsPerCacheLine / ThreadblockK; + +public: + using Layout = layout::ColumnMajorTileInterleave; + static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; + using Operator = cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA; +}; + +template +struct LayoutDetailsB= 75>::type> +{ + static constexpr int ThreadblockK = 64; + +private: + static constexpr int ElementsPerCacheLine = 128 * 8 / sizeof_bits::value; + static constexpr int ColumnsInterleaved = ElementsPerCacheLine / ThreadblockK; + +public: + using Layout = layout::ColumnMajorTileInterleave; + static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; + using Operator = cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA; +}; + +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/csrc/cutlass_extensions/gemm/kernel/moe_cutlass_kernel.h b/csrc/cutlass_extensions/gemm/kernel/moe_cutlass_kernel.h new file mode 100644 index 000000000000..4c5c8cc64f43 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/kernel/moe_cutlass_kernel.h @@ -0,0 +1,526 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file + \brief +*/ + +#pragma once + +#include "cutlass/complex.h" +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/semaphore.h" + +#include "cutlass/gemm/kernel/gemm_transpose_operands.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/trace.h" + +#include "cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h" +#include "cutlass_extensions/tile_interleaved_layout.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace kernel +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// +// This section exists to that we can use the same kernel code for regular gemm and dequantizing gemms. +// It will dispatch to the dequantizing gemm if the Mma type has an Iterator for scales in global. +template +using void_t = void; + +template +struct use_dq_gemm : platform::false_type +{ +}; + +template +struct use_dq_gemm> : platform::true_type +{ +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MoeFCGemm +{ +public: + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_; + static bool const kTransposed = false; + + // Optional transpose + using MapArguments = kernel::detail::MapArguments; + + // Public-facing type definitions related to operand element type, layout, and complex conjugate + // operation. Must interact with the 'kTransposed' notion. + static_assert(!kTransposed, "Transpose problem not supported"); + using ElementA = typename MapArguments::ElementA; + using LayoutA = typename MapArguments::LayoutA; + using ElementB = typename MapArguments::ElementB; + using LayoutB = typename MapArguments::LayoutB; + using ElementC = typename Epilogue::OutputTileIterator::Element; + using LayoutC = typename MapArguments::LayoutC; + using ElementScale = ElementC; + + static ComplexTransform const kTransformA = MapArguments::kTransformA; + static ComplexTransform const kTransformB = MapArguments::kTransformB; + + // Type definitions about the mainloop. 
+ using Operator = typename Mma::Operator; + using OperatorClass = typename Mma::Operator::OperatorClass; + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename Mma::Operator::Shape; + using InstructionShape = typename Mma::Policy::Operator::InstructionShape; + using ArchTag = typename Mma::ArchTag; + + static int const kStages = Mma::kStages; + static int const kAlignmentA = MapArguments::kAlignmentA; + static int const kAlignmentB = MapArguments::kAlignmentB; + static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + using ProblemVisitor + = GemmMoeProblemVisitor; + + // + // Structures + // + + /// Argument structure + struct Arguments + { + + // + // Data members + // + + int problem_count; + int threadblock_count; + int group_size; + + typename EpilogueOutputOp::Params output_op; + + ElementA* ptr_A; + ElementB* ptr_B; + ElementScale* weight_scales; + ElementC* ptr_C; + ElementC* ptr_D; + + int64_t* total_rows_before_expert; + int64_t gemm_n; + int64_t gemm_k; + + // Only used by device-level operator + GemmCoord* host_problem_sizes; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() + : problem_count(0) + , threadblock_count(0) + , ptr_A(nullptr) + , ptr_B(nullptr) + , weight_scales(nullptr) + , ptr_C(nullptr) + , ptr_D(nullptr) + , total_rows_before_expert(nullptr) + , gemm_n(0) + , gemm_k(0) + , host_problem_sizes(nullptr) + { + } + + /// Ctor + CUTLASS_HOST_DEVICE + Arguments(int problem_count, int threadblock_count, int group_size, typename EpilogueOutputOp::Params output_op, + const ElementA* ptr_A, const ElementB* ptr_B, const ElementScale* weight_scales, const ElementC* ptr_C, + ElementC* ptr_D, int64_t* total_rows_before_expert, int64_t gemm_n, int64_t gemm_k, + GemmCoord* host_problem_sizes = nullptr) + : problem_count(problem_count) + , threadblock_count(threadblock_count) + , group_size(group_size) + , output_op(output_op) + , ptr_A(const_cast(ptr_A)) + , ptr_B(const_cast(ptr_B)) + , weight_scales(const_cast(weight_scales)) + , ptr_C(const_cast(ptr_C)) + , ptr_D(ptr_D) + , total_rows_before_expert(total_rows_before_expert) + , gemm_n(gemm_n) + , gemm_k(gemm_k) + , host_problem_sizes(nullptr) + { + if (platform::is_same::value || platform::is_same::value) + { + assert(weight_scales); + } + } + }; + + // + // Structure for precomputing values in host memory and passing to kernels + // + + /// Parameters structure + struct Params + { + + typename ProblemVisitor::Params problem_visitor; + int threadblock_count; + int group_size; + + typename EpilogueOutputOp::Params output_op; + + ElementA* ptr_A; + ElementB* ptr_B; + ElementScale* weight_scales; + ElementC* ptr_C; + ElementC* ptr_D; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() + : ptr_A(nullptr) + , ptr_B(nullptr) + , weight_scales(nullptr) + , ptr_C(nullptr) + , ptr_D(nullptr) + { + } + + CUTLASS_HOST_DEVICE + Params(Arguments const& args, void* workspace = nullptr, int tile_count = 0) + : problem_visitor( + args.total_rows_before_expert, args.gemm_n, args.gemm_k, args.problem_count, workspace, tile_count) + , threadblock_count(args.threadblock_count) + , group_size(args.group_size) + , output_op(args.output_op) + , ptr_A(args.ptr_A) + , ptr_B(args.ptr_B) + , weight_scales(args.weight_scales) + , ptr_C(args.ptr_C) + , ptr_D(args.ptr_D) + { + } + + CUTLASS_HOST_DEVICE + void update(Arguments 
const& args, void* workspace = nullptr, int tile_count = 0) + { + + problem_visitor = typename ProblemVisitor::Params( + args.total_rows_before_expert, args.gemm_n, args.gemm_k, args.problem_count, workspace, tile_count); + threadblock_count = args.threadblock_count; + output_op = args.output_op; + ptr_A = args.ptr_A; + ptr_B = args.ptr_B; + weight_scales = args.weight_scales; + ptr_C = args.ptr_C; + ptr_D = args.ptr_D; + } + }; + + /// Shared memory storage structure + union SharedStorage + { + typename ProblemVisitor::SharedStorage problem_visitor; + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + +public: + // + // Methods + // + + CUTLASS_DEVICE + MoeFCGemm() {} + + /// Determines whether kernel satisfies alignment + static Status can_implement(cutlass::gemm::GemmCoord const& problem_size) + { + return Status::kSuccess; + } + + static Status can_implement(Arguments const& args) + { + if (platform::is_same::value || platform::is_same::value) + { + if (args.weight_scales == nullptr) + { + CUTLASS_TRACE_HOST("MoeFCGemm::can_implement() - weight scales are required for uint8_t and uint4b_t"); + return Status::kInvalid; + } + } + else if (args.weight_scales != nullptr) + { + CUTLASS_TRACE_HOST( + "MoeFCGemm::can_implement() - weight scales are ignored for all types except uint8_t and uint4b_t"); + return Status::kInvalid; + } + else if (args.group_size != args.gemm_k) + { + CUTLASS_TRACE_HOST("MoeFCGemm::can_implement() - scale shape should be (1, gemm_n)"); + return Status::kInvalid; + } + // Handle the case the input is too short + else if (args.gemm_n < Mma::IteratorB::AccessType::kElements) + { + CUTLASS_TRACE_HOST("MoeFCGemm::can_implement() - gemm_n is smaller than the input alignment"); + return Status::kInvalid; + } + return Status::kSuccess; + } + + static size_t get_extra_workspace_size(Arguments const& args, cutlass::gemm::GemmCoord const& grid_tiled_shape) + { + + return 0; + } + + // The dummy template parameter is not used and exists so that we can compile this code using + // a standard earlier than C++17. Prior to C++17, fully specialized templates HAD to exists in + // a namespace + template + struct KernelRunner + { + CUTLASS_DEVICE + static void run_kernel(Params const& params, SharedStorage& shared_storage) + { + CUTLASS_NOT_IMPLEMENTED(); + } + }; + + template + struct KernelRunner + { + CUTLASS_DEVICE + static void run_kernel(Params const& params, SharedStorage& shared_storage) + { + // + // These types shadow the type-level definitions and support the ability to implement + // a 'transposed' GEMM that computes the transposed problems. + // + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using ElementC = typename Epilogue::OutputTileIterator::Element; + using LayoutC = typename Epilogue::OutputTileIterator::Layout; + static constexpr int kInterleave = Mma::IteratorB::Shape::kRow / Mma::Shape::kK; + static_assert(platform::is_same::value && kInterleave == 1 + || platform::is_same::value && kInterleave >= 1, + "B must be row major/col major OR col major interleaved."); + + // + // Problem visitor. 
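+            // (Persistent-kernel scheduling: each CTA repeatedly asks the visitor for its next
+            // tile and advances by gridDim.x, so a fixed number of CTAs covers every expert GEMM.)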
+ // + ProblemVisitor problem_visitor(params.problem_visitor, shared_storage.problem_visitor, blockIdx.x); + + const int64_t gemm_k = params.problem_visitor.gemm_k; + const int64_t gemm_n = params.problem_visitor.gemm_n; + int64_t bytes_per_expert_matrix = (gemm_k * gemm_n / 8) * cutlass::sizeof_bits::value; + + // Outer 'persistent' loop to iterate over tiles + int loop = 0; + while (problem_visitor.next_tile()) + { + loop++; + + GemmCoord problem_size = problem_visitor.problem_size(); + int32_t problem_idx = problem_visitor.problem_index(); + int32_t cta_idx = int32_t(problem_visitor.threadblock_idx()); + + GemmCoord grid_shape = problem_visitor.grid_shape(problem_size); + + cutlass::gemm::GemmCoord threadblock_offset( + int(cta_idx / grid_shape.n()) * Mma::Shape::kM, int(cta_idx % grid_shape.n()) * Mma::Shape::kN, 0); + + // Load element pointers. Exchange pointers and strides if working on the transpose + const int64_t rows_to_jump + = problem_idx == 0 ? 0 : params.problem_visitor.last_row_for_problem[problem_idx - 1]; + ElementA* ptr_A = reinterpret_cast(params.ptr_A) + rows_to_jump * gemm_k; + typename LayoutA::LongIndex ldm_A = gemm_k; + + char* byte_ptr_B = ((char*) params.ptr_B) + problem_idx * bytes_per_expert_matrix; + ElementB* ptr_B = reinterpret_cast(byte_ptr_B); + typename LayoutB::LongIndex ldm_B + = platform::is_same::value ? gemm_n : gemm_k * kInterleave; + + // Compute initial location in logical coordinates + cutlass::MatrixCoord tb_offset_A{ + threadblock_offset.m(), + 0, + }; + + cutlass::MatrixCoord tb_offset_B{0, threadblock_offset.n() / kInterleave}; + + cutlass::MatrixCoord tb_offset_scale{0, threadblock_offset.n()}; + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + LayoutA(ldm_A), ptr_A, {problem_size.m(), problem_size.k()}, thread_idx, tb_offset_A); + + typename Mma::IteratorB iterator_B(LayoutB(ldm_B), ptr_B, + {problem_size.k() * kInterleave, problem_size.n() / kInterleave}, thread_idx, tb_offset_B); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + + int lane_idx = threadIdx.x % 32; + + // + // Matrix multiply phase + // + + // Construct thread-scoped matrix multiply + auto CreateMMA = [&]() + { + if constexpr (use_dq_gemm::value) + return Mma(shared_storage.main_loop, params.group_size, thread_idx, warp_idx, lane_idx); + else + return Mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + }; + Mma mma = CreateMMA(); + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Wait for all threads to finish their epilogue phases from the previous tile. 
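+                // (SharedStorage is a union shared by the problem visitor, mainloop and epilogue,
+                // so this barrier keeps a fast warp from overwriting shared memory that another
+                // warp's epilogue for the previous tile may still be reading.)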
+ __syncthreads(); + + // Compute threadblock-scoped matrix multiply-add + ElementScale* weight_scale_ptr = params.weight_scales + problem_idx * problem_size.n(); + + if constexpr (use_dq_gemm::value) + { + const MatrixCoord scale_extent = {1, problem_size.n()}; + typename Mma::IteratorScale iterator_scale(Mma::IteratorScale::Layout(scale_extent.column()), + weight_scale_ptr, scale_extent, thread_idx, tb_offset_scale); + + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, iterator_scale, accumulators); + } + else + { + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); + } + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + ElementC* ptr_C = reinterpret_cast(params.ptr_C) + problem_idx * gemm_n; + ElementC* ptr_D = reinterpret_cast(params.ptr_D) + rows_to_jump * gemm_n; + + LayoutC layout_C(0); + LayoutC layout_D(gemm_n); + + typename Epilogue::OutputTileIterator::Params params_C(layout_C); + typename Epilogue::OutputTileIterator::Params params_D(layout_D); + + // Tile iterator loading from source tensor. + typename Epilogue::OutputTileIterator iterator_C( + params_C, ptr_C, problem_size.mn(), thread_idx, threadblock_offset.mn()); + + // Tile iterator writing to destination tensor. + typename Epilogue::OutputTileIterator iterator_D( + params_D, ptr_D, problem_size.mn(), thread_idx, threadblock_offset.mn()); + + Epilogue epilogue(shared_storage.epilogue, thread_idx, warp_idx, lane_idx); + + // Execute the epilogue operator to update the destination tensor. + epilogue(output_op, iterator_D, accumulators, iterator_C); + + // Next tile + problem_visitor.advance(gridDim.x); + } + } + }; + + /* + To improve compilation speed, we do not compile the device operator if the CUDA_ARCH does not correspond + to the ArchTag of the cutlass kernel operator. + */ + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const& params, SharedStorage& shared_storage) + { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) && (__CUDA_ARCH__ < 750) + static constexpr bool compile_needed = platform::is_same::value; + KernelRunner::run_kernel(params, shared_storage); +#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) && (__CUDA_ARCH__ < 800) + static constexpr bool compile_needed = platform::is_same::value; + KernelRunner::run_kernel(params, shared_storage); +#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDA_ARCH__ < 900) + static constexpr bool compile_needed = platform::is_same::value; + KernelRunner::run_kernel(params, shared_storage); +#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + // TODO Update the arch to Sm90 once CUTLASS hopper specialisations are available + static constexpr bool compile_needed = platform::is_same::value; + KernelRunner::run_kernel(params, shared_storage); +#else + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/kernel/moe_problem_visitor.h b/csrc/cutlass_extensions/gemm/kernel/moe_problem_visitor.h new file mode 100644 index 000000000000..cd9270d1414d --- /dev/null +++ b/csrc/cutlass_extensions/gemm/kernel/moe_problem_visitor.h @@ -0,0 +1,344 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*! \file + \brief Base scheduler for grouped problems, using MoE +*/ + +#pragma once + +#include "cutlass/gemm/kernel/grouped_problem_visitor.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace kernel +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Visitor class to abstract away the algorithm for iterating over tiles +template +struct BaseMoeProblemVisitor +{ + using ThreadblockShape = ThreadblockShape_; + + struct ProblemInfo + { + static int32_t const kNoPrefetchEntry = -1; + int32_t problem_idx; + int32_t problem_start; + + CUTLASS_DEVICE + ProblemInfo() + : problem_idx(kNoPrefetchEntry) + , problem_start(kNoPrefetchEntry) + { + } + + CUTLASS_DEVICE + ProblemInfo(int32_t problem_idx_, int32_t problem_start_) + : problem_idx(problem_idx_) + , problem_start(problem_start_) + { + } + }; + + struct Params + { + int64_t const* last_row_for_problem; + int64_t gemm_n; + int64_t gemm_k; + int32_t problem_count; + void const* workspace; + int32_t tile_count; + + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + Params() + : last_row_for_problem(nullptr) + , gemm_n(0) + , gemm_k(0) + , problem_count(0) + , workspace(nullptr) + , tile_count(0) + { + } + + /// Ctor + CUTLASS_HOST_DEVICE + Params(int64_t const* last_row_for_problem, int64_t gemm_n, int64_t gemm_k, int32_t problem_count, + void const* workspace = nullptr, int32_t tile_count = 0) + : last_row_for_problem(last_row_for_problem) + , gemm_n(gemm_n) + , gemm_k(gemm_k) + , problem_count(problem_count) + , workspace(workspace) + , tile_count(tile_count) + { + } + }; + + Params const& params; + int32_t tile_idx; + int32_t problem_tile_start; + int32_t problem_idx; + + // + // Methods + // + CUTLASS_DEVICE + BaseMoeProblemVisitor(Params const& params_, int32_t block_idx) + : params(params_) + , tile_idx(block_idx) + , problem_tile_start(0) + , problem_idx(0) + { + } + + /// Get the grid shape + CUTLASS_HOST_DEVICE + static cutlass::gemm::GemmCoord grid_shape(const cutlass::gemm::GemmCoord& problem) + { + + return cutlass::gemm::GemmCoord(((problem.m() - 1 + ThreadblockShape::kM) / ThreadblockShape::kM), + ((problem.n() - 1 + ThreadblockShape::kN) / ThreadblockShape::kN), 1); + } + + /// Gets the global tile index + CUTLASS_HOST_DEVICE + int32_t tile_index() const + { + return tile_idx; + } + + /// Gets the index of the problem + CUTLASS_HOST_DEVICE + int32_t problem_index() const + { + return problem_idx; + } + + CUTLASS_HOST_DEVICE + int32_t threadblock_idx() const + { + return tile_idx - problem_tile_start; + } + + CUTLASS_DEVICE + void advance(int32_t grid_size) + { + tile_idx += grid_size; + } + + CUTLASS_HOST_DEVICE + static void possibly_transpose_problem(cutlass::gemm::GemmCoord& problem) + { + ProblemSizeHelper::possibly_transpose_problem(problem); + } + + 
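To make the grid_shape() arithmetic above concrete: each problem is tiled by ceil-dividing its M and N extents by the threadblock shape, and the per-problem tile count is simply the product of the two grid dimensions. A minimal sketch with made-up sizes:

```cpp
#include <cstdio>

struct GridShape { int m, n; };

constexpr int ceil_div(int x, int y) { return (x - 1 + y) / y; }

constexpr GridShape grid_shape(int problem_m, int problem_n, int tb_m, int tb_n) {
    return {ceil_div(problem_m, tb_m), ceil_div(problem_n, tb_n)};
}

int main() {
    // e.g. a 128x128 threadblock tile over a 300x1024 problem
    constexpr GridShape grid = grid_shape(300, 1024, 128, 128);
    static_assert(grid.m == 3 && grid.n == 8, "ceil division over M and N");
    std::printf("grid = %d x %d, tiles for this problem = %d\n", grid.m, grid.n, grid.m * grid.n);
    return 0;
}
```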
/// Returns the problem size for the current problem + CUTLASS_HOST_DEVICE + cutlass::gemm::GemmCoord problem_size() const + { + return problem_size(problem_idx); + } + + CUTLASS_HOST_DEVICE + cutlass::gemm::GemmCoord problem_size(int idx) const + { + const int64_t prev_problem_row = idx == 0 ? 0 : params.last_row_for_problem[idx - 1]; + const int64_t current_problem_row = params.last_row_for_problem[idx]; + const int64_t gemm_m = current_problem_row - prev_problem_row; + GemmCoord problem(GemmCoord::Index(gemm_m), GemmCoord::Index(params.gemm_n), GemmCoord::Index(params.gemm_k)); + ProblemSizeHelper::possibly_transpose_problem(problem); + return problem; + } + + CUTLASS_HOST_DEVICE + static int32_t tile_count(const cutlass::gemm::GemmCoord& grid) + { + return ProblemSizeHelper::tile_count(grid); + } + + static int32_t group_tile_count(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr, int32_t problem_count) + { + int32_t total_tiles = 0; + for (int32_t i = 0; i < problem_count; ++i) + { + auto problem = host_problem_sizes_ptr[i]; + possibly_transpose_problem(problem); + auto grid = grid_shape(problem); + total_tiles += tile_count(grid); + } + + return total_tiles; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MoeProblemVisitor; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// ProblemVisitor that performs all scheduling on device +// +template +struct MoeProblemVisitor : public BaseMoeProblemVisitor +{ + using Base = BaseMoeProblemVisitor; + using Params = typename Base::Params; + static int const kThreadCount = ThreadCount; + static bool const kRequiresPrecomputation = false; + static int const kThreadsPerWarp = 32; + + struct SharedStorage + { + }; + + // Final tile of the problem loaded by this thread. Each thread will hold + // a separate value. + int32_t problem_ending_tile; + + SharedStorage& shared_storage; + + // + // Methods + // + CUTLASS_DEVICE + MoeProblemVisitor(Params const& params_, SharedStorage& shared_storage_, int32_t block_idx) + : Base(params_, block_idx) + , problem_ending_tile(0) + , shared_storage(shared_storage_) + { + this->problem_idx = -1 * kThreadsPerWarp; + this->problem_tile_start = 0; + } + + CUTLASS_DEVICE + bool next_tile() + { + // Check whether the tile to compute is within the range of the current problem. + int32_t problem_tile_end = __shfl_sync(0xffffffff, problem_ending_tile, this->problem_idx % kThreadsPerWarp); + if (this->tile_idx < problem_tile_end) + { + return true; + } + + // Check whether the tile to compute is within the current group of problems fetched by the warp. + // The last tile for this group is the final tile of the problem held by the final thread in the warp. + int32_t group_tile_end = __shfl_sync(0xffffffff, problem_ending_tile, kThreadsPerWarp - 1); + + // Keep the starting problem for this group in `problem_idx`. This is done to reduce + // register pressure. The starting problem for this group is simply the first problem + // in the group most recently fetched by the warp. + int32_t& group_problem_start = this->problem_idx; + group_problem_start = (this->problem_idx / kThreadsPerWarp) * kThreadsPerWarp; + + // Keep the starting tile for this group in `problem_tile_start`. This is done to reduce + // register pressure. 
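The warp-cooperative bookkeeping assembled in next_tile() (continued just below) ultimately answers one question: given a global tile index, which problem owns it, and what is the tile's index within that problem? A scalar host-side reference of that mapping, with invented per-problem tile counts, may help when reading the shuffle-based version:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Returns false once tile_idx is past the last tile, i.e. the persistent loop is done.
bool map_tile(const std::vector<int>& tiles_per_problem, int tile_idx,
              int& problem_idx, int& threadblock_idx) {
    int problem_tile_start = 0;
    for (std::size_t i = 0; i < tiles_per_problem.size(); ++i) {
        const int problem_tile_end = problem_tile_start + tiles_per_problem[i];
        if (tile_idx < problem_tile_end) {
            problem_idx = static_cast<int>(i);
            threadblock_idx = tile_idx - problem_tile_start;  // local tile within the problem
            return true;
        }
        problem_tile_start = problem_tile_end;
    }
    return false;
}

int main() {
    // Per-expert tile counts; the second expert received no tokens and is skipped naturally.
    const std::vector<int> tiles_per_problem = {24, 0, 8, 16};
    const int queries[] = {0, 23, 24, 31, 47, 48};
    for (int t : queries) {
        int problem = -1, local_tile = -1;
        if (map_tile(tiles_per_problem, t, problem, local_tile)) {
            std::printf("tile %2d -> problem %d, local tile %d\n", t, problem, local_tile);
        } else {
            std::printf("tile %2d -> past the end, persistent loop exits\n", t);
        }
    }
    return 0;
}
```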
+ int32_t& group_tile_start = this->problem_tile_start; + + // Each thread in the warp processes a separate problem to advance until + // reaching a problem whose starting tile is less less than tile_idx. + while (group_tile_end <= this->tile_idx) + { + group_problem_start += kThreadsPerWarp; + if (group_problem_start > this->params.problem_count) + { + return false; + } + + // Since `group_tile_start` is a reference to `this->problem_tile_start`, this + // also sets `this->problem_tile_start`. The fact that `this->problem_tile_start` + // is also set here is used later in `next_tile`. + group_tile_start = group_tile_end; + + int lane_idx = threadIdx.x % kThreadsPerWarp; + int32_t lane_problem = group_problem_start + lane_idx; + + // Compute the number of tiles in the problem assigned to each thread. + problem_ending_tile = 0; + if (lane_problem < this->params.problem_count) + { + cutlass::gemm::GemmCoord problem = this->problem_size(lane_problem); + cutlass::gemm::GemmCoord grid = this->grid_shape(problem); + problem_ending_tile = this->tile_count(grid); + } + + // Compute a warp-wide inclusive prefix sum to compute the ending tile index of + // each thread's problem. + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < kThreadsPerWarp; i <<= 1) + { + int32_t val = __shfl_up_sync(0xffffffff, problem_ending_tile, i); + if (lane_idx >= i) + { + problem_ending_tile += val; + } + } + + // The total tile count for this group is now in the final position of the prefix sum + int32_t tiles_in_group = __shfl_sync(0xffffffff, problem_ending_tile, kThreadsPerWarp - 1); + + problem_ending_tile += group_tile_start; + group_tile_end += tiles_in_group; + } + + // The next problem to process is the first one that does not have ending tile position + // that is greater than or equal to tile index. + int32_t problem_idx_in_group = __popc(__ballot_sync(0xffffffff, problem_ending_tile <= this->tile_idx)); + + this->problem_idx = group_problem_start + problem_idx_in_group; + + // The starting tile for this problem is the ending tile of the previous problem. In cases + // where `problem_idx_in_group` is the first problem in the group, we do not need to reset + // `problem_tile_start`, because it is set to the previous group's ending tile in the while + // loop above. + if (problem_idx_in_group > 0) + { + this->problem_tile_start = __shfl_sync(0xffffffff, problem_ending_tile, problem_idx_in_group - 1); + } + + return true; + } + + static size_t get_workspace_size( + const cutlass::gemm::GemmCoord* host_problem_sizes_ptr, int32_t problem_count, int32_t block_count) + { + return 0; + } + + static void host_precompute(const cutlass::gemm::GemmCoord* host_problem_sizes_ptr, int32_t problem_count, + int32_t block_count, void* host_workspace_ptr) + { + } +}; + +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/csrc/cutlass_extensions/gemm/threadblock/default_dq_mma.h b/csrc/cutlass_extensions/gemm/threadblock/default_dq_mma.h new file mode 100644 index 000000000000..a10ed85a8b0b --- /dev/null +++ b/csrc/cutlass_extensions/gemm/threadblock/default_dq_mma.h @@ -0,0 +1,125 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
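Before moving on to the next file, one more note on next_tile(): the __shfl_up_sync loop is a warp-wide inclusive (Hillis-Steele) prefix sum over the per-lane tile counts, and the __popc(__ballot_sync(...)) step then counts how many problems in the group end at or before tile_idx. A scalar rendering of the same arithmetic, with invented counts:

```cpp
#include <array>
#include <cstdio>

int main() {
    constexpr int kThreadsPerWarp = 32;

    // Per-lane tile counts for the 32 problems most recently fetched by the warp.
    std::array<int, kThreadsPerWarp> ending_tile{};
    ending_tile[0] = 4; ending_tile[1] = 0; ending_tile[2] = 6; ending_tile[3] = 2;

    // Inclusive prefix sum: afterwards ending_tile[i] is the ending tile of problem i
    // within the group (the serial equivalent of the __shfl_up_sync loop).
    for (int i = 1; i < kThreadsPerWarp; ++i) {
        ending_tile[i] += ending_tile[i - 1];
    }

    // Selection: count how many problems end at or before tile_idx
    // (the serial equivalent of __popc(__ballot_sync(...))).
    const int tile_idx = 7;
    int problem_idx_in_group = 0;
    for (int i = 0; i < kThreadsPerWarp; ++i) {
        if (ending_tile[i] <= tile_idx) {
            ++problem_idx_in_group;
        }
    }

    // Problems 0 and 1 end at tiles 4 and 4, so tile 7 lands in problem 2 of the group.
    std::printf("tile %d belongs to problem %d of this group\n", tile_idx, problem_idx_in_group);
    return 0;
}
```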
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "cutlass_extensions/arch/mma.h" +#include "cutlass_extensions/interleaved_numeric_conversion.h" + +namespace cutlass +{ +namespace gemm +{ +namespace threadblock +{ +//////////////////////////////////////////////////////////////////////////////// + +// We need to distinguish here, since we want volta support. It is too much effort +// to write shared memory iterators that are probably needed for volta to function +// properly. As a result, we allow converters both after the LDG (for volta) and after +// the LDS for Turing+. +template < + /// Iterator for B matrix in global memory + typename IteratorB, + /// Warp level Mma + typename MmaOperator, + /// Math operation perform by warp level operator + typename MathOperator> +struct SetConverters +{ +}; + +// Dequantize after LDG, so set transforms accordingly +template < + /// Iterator for B matrix in global memory + typename IteratorB, + /// Mma Policy + typename MmaOperator> +struct SetConverters +{ + using TransformAfterLDG + = FastInterleavedAndBiasedNumericArrayConverter; + + using TransformAfterLDS = NumericArrayConverter; +}; + +// Dequantize after LDS, so set transforms accordingly + +template < + /// Iterator for B matrix in global memory + typename IteratorB, + /// Mma Policy + typename MmaOperator> +struct SetConverters +{ + using TransformAfterLDG = NumericArrayConverter; + + using TransformAfterLDS + = FastInterleavedAndBiasedNumericArrayConverter; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for the input scale + typename ElementScale_, + /// Layout for the scale operand + typename LayoutScale_, + /// Access granularity of Scales in unit of elements + int kAlignmentScale, + /// Element type for internal accumulation + typename ElementAccumulator_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Operator class tag + typename OperatorClass_, + /// Tag indicating architecture to tune for + typename ArchTag_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape_, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator_, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone, + /// + typename Enable = void> +struct DqMma; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git 
a/csrc/cutlass_extensions/gemm/threadblock/default_dq_mma_multistage.h b/csrc/cutlass_extensions/gemm/threadblock/default_dq_mma_multistage.h new file mode 100644 index 000000000000..bd4c16ee0194 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/threadblock/default_dq_mma_multistage.h @@ -0,0 +1,297 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass_extensions/arch/mma.h" + +#include "cutlass_extensions/gemm/threadblock/dq_mma_multistage.h" +#include "cutlass_extensions/gemm/warp/default_mma_tensor_op.h" +#include "cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h" +#include "cutlass_extensions/tile_interleaved_layout.h" + +#include "cutlass_extensions/gemm/threadblock/default_dq_mma.h" +#include "cutlass_extensions/transform/threadblock/fine_grained_scale_zero_iterator.h" + +namespace cutlass +{ +namespace gemm +{ +namespace threadblock +{ + +//////////////////////////////////////////////////////////////////////////////// + +template +struct DefaultScaleIterators; + +// Fine grained iterators +template +struct DefaultScaleIterators> +{ + using IteratorScale + = cutlass::transform::threadblock::FineGrainedScaleZeroIterator, Element, + Layout, 0, Alignment>; + + using SmemIteratorScale = IteratorScale; +}; + +// Per column iterators +template +struct DefaultScaleIterators> +{ + // ThreadMap for scale iterator + static_assert((MmaShape::kN % Alignment) == 0, ""); + +private: + using IteratorScaleThreadMap = transform::PitchLinearStripminedThreadMap, + MmaShape::kN / Alignment, Alignment>; + +public: + // Define iterators over tiles from the scale operand + using IteratorScale = cutlass::transform::threadblock::PredicatedTileIterator, + Element, Layout, 0, IteratorScaleThreadMap, Alignment>; + + using SmemIteratorScale = IteratorScale; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Type for elementA + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Type for element B + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for the input scale + typename ElementScale, + /// Layout for the scale operand + typename LayoutScale, + /// Access granularity of Scales in unit of elements + int kAlignmentScale, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// 
Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Stages in GEMM + int kStages, + /// + typename Operator_, + /// + SharedMemoryClearOption SharedMemoryClear> +struct DqMma= 80 && !layout::IsColumnMajorTileInterleave::value)>::type> +{ + + static_assert(platform::is_same::value || platform::is_same::value, + "Element A must be fp16 or bf16"); + + using OperatorInfo = arch::DetagOperator; + using Operator = typename OperatorInfo::Operator; + static_assert(platform::is_same::value, + "Mma multistage must dequantize after ldsm"); + + static_assert(platform::is_same::value || platform::is_same::value, + "Element B must be uint8 or uint4"); + + static cutlass::arch::CacheOperation::Kind const CacheOpA = ((sizeof_bits::value * kAlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = ((sizeof_bits::value * kAlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the MmaCore components + // Mma core does not depend on stages, so pass in at least 3 here to mma multistage pieces are created + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, ElementA, LayoutA, 1, ThreadMapA, + AccessTypeA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, ElementB, LayoutB, 0, ThreadMapB, + AccessTypeB>; + + using ScaleIterators = DefaultScaleIterators; + + // Define iterators over tiles from the scale operand + using IteratorScale = typename ScaleIterators::IteratorScale; + + using SmemIteratorScale = typename ScaleIterators::SmemIteratorScale; + + using Converter = FastInterleavedAndBiasedNumericArrayConverter; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::DqMmaMultistage; +}; + +template < + /// Type for element A + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Type for element B + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for the input scale + typename ElementScale, + /// Layout for the scale operand + typename LayoutScale, + /// Access granularity of Scales in unit of elements + int kAlignmentScale, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Stages in GEMM + int kStages, + /// + typename Operator_, + /// + SharedMemoryClearOption SharedMemoryClear> +struct DqMma= 80 && layout::IsColumnMajorTileInterleave::value)>::type> +{ + + 
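Stepping back to the DefaultScaleIterators split introduced earlier in this file: the per-column and fine-grained cases differ mainly in how a scale is addressed, one scale per output column versus one scale per (K-group, column) pair. A small sketch of the two lookups follows; the row-major [groups x N] scale layout and all sizes are assumptions for illustration, not a statement about the iterators' actual internal layout.

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int N = 8, K = 256, group_size = 64;

    // Per-column: one scale per output column, reused for every k.
    std::vector<float> per_column_scale(N, 0.5f);

    // Fine-grained: one scale per (K-group, column) pair,
    // stored here row-major as [K / group_size][N].
    std::vector<float> fine_grained_scale((K / group_size) * N, 0.25f);

    const int k = 130, n = 3;
    const float s_col = per_column_scale[n];
    const float s_fine = fine_grained_scale[(k / group_size) * N + n];

    std::printf("column %d: per-column scale %.2f, group %d fine-grained scale %.2f\n",
                n, s_col, k / group_size, s_fine);
    return 0;
}
```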
static_assert(platform::is_same::value || platform::is_same::value, + "Element A must be fp16 or bf16"); + + using OperatorInfo = arch::DetagOperator; + using Operator = typename OperatorInfo::Operator; + static_assert(platform::is_same::value, + "Mma multistage must dequantize after ldsm"); + + static_assert(platform::is_same::value || platform::is_same::value, + "Element B must be uint8 or uint4"); + + static cutlass::arch::CacheOperation::Kind const CacheOpA = ((sizeof_bits::value * kAlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = ((sizeof_bits::value * kAlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the MmaCore components + // Mma core does not depend on stages, so pass in at least 3 here to mma multistage pieces are created + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, ElementA, LayoutA, 1, ThreadMapA, + AccessTypeA>; + +private: + static constexpr int ColumnsInterleaved = LayoutB::kColumnsInterleaved; + static constexpr int RowsPerTile = LayoutB::kRowsPerTile; + static_assert(!(MmaCore::Shape::kN % ColumnsInterleaved), ""); + static_assert(RowsPerTile == MmaCore::Shape::kK, ""); + + using OriginalThreadMap = typename MmaCore::IteratorThreadMapB; + using OriginalWarpArrangement = typename OriginalThreadMap::Detail::WarpThreadArrangement; + static_assert(!(OriginalWarpArrangement::kStrided % ColumnsInterleaved), ""); + + using GmemIteratorShape + = MatrixShape; + using GmemThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, OriginalThreadMap::kThreads, + layout::PitchLinearShape, + MmaCore::kAccessSizeInBits / sizeof_bits::value>; + +public: + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator; + + using ScaleIterators = DefaultScaleIterators; + + // Define iterators over tiles from the scale operand + using IteratorScale = typename ScaleIterators::IteratorScale; + + using SmemIteratorScale = typename ScaleIterators::SmemIteratorScale; + + using Converter = FastInterleavedAndBiasedNumericArrayConverter; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::DqMmaMultistage; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/csrc/cutlass_extensions/gemm/threadblock/default_dq_mma_pipelined.h b/csrc/cutlass_extensions/gemm/threadblock/default_dq_mma_pipelined.h new file mode 100644 index 000000000000..f94e1950e589 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/threadblock/default_dq_mma_pipelined.h @@ -0,0 +1,249 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass_extensions/arch/mma.h" + +#include "cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h" +#include "cutlass_extensions/gemm/warp/default_mma_tensor_op.h" +#include "cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h" +#include "cutlass_extensions/tile_interleaved_layout.h" + +#include "cutlass_extensions/gemm/threadblock/default_dq_mma.h" + +namespace cutlass +{ +namespace gemm +{ +namespace threadblock +{ + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Type for element A + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Type for element B + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for the input scale + typename ElementScale, + /// Layout for the scale operand + typename LayoutScale, + /// Access granularity of Scales in unit of elements + int kAlignmentScale, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator_> +struct DqMma::value)>::type> +{ + + static_assert(platform::is_same::value || platform::is_same::value, + "Element A must be fp16 or bf16"); + + static_assert(platform::is_same::value || platform::is_same::value, + "Element B must be uint8 or uint4"); + + using OperatorInfo = arch::DetagOperator; + using Operator = typename OperatorInfo::Operator; + static_assert(OperatorInfo::QuantOp == WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY, ""); + + static constexpr bool DqAfterLDG = platform::is_same::value; + static constexpr bool arch_has_bf16_mma = ArchTag::kMinComputeCapability >= 80; + using MmaCoreElementA = typename platform::conditional::type; + using MmaCoreElementB = typename platform::conditional::type; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, ElementA, LayoutA, 1, + typename MmaCore::IteratorThreadMapA, kAlignmentA>; + + // Define iterators over tiles from the B operand + using IteratorB = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, ElementB, LayoutB, 0, + typename MmaCore::IteratorThreadMapB, kAlignmentB>; + + // ThreadMap for scale iterator + static_assert((MmaCore::Shape::kN % kAlignmentScale) == 0, ""); + using IteratorScaleThreadMap + = transform::PitchLinearStripminedThreadMap, 
+ MmaCore::Shape::kN / kAlignmentScale, kAlignmentScale>; + + // Define iterators over tiles from the scale operand + using IteratorScale + = cutlass::transform::threadblock::PredicatedTileIterator, + ElementScale, LayoutScale, 0, IteratorScaleThreadMap, kAlignmentScale>; + + using SmemScaleType = typename platform::conditional::type; + using SmemIteratorScale + = cutlass::transform::threadblock::PredicatedTileIterator, + SmemScaleType, LayoutScale, 0, IteratorScaleThreadMap, kAlignmentScale>; + + using Converters = SetConverters; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::DqMmaPipelined; +}; + +// Specialization to handle column major interleave B +template < + /// Type for element A + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Type for element B + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for the input scale + typename ElementScale, + /// Layout for the scale operand + typename LayoutScale, + /// Access granularity of Scales in unit of elements + int kAlignmentScale, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator_> +struct DqMma::value)>::type> +{ + + static_assert(platform::is_same::value || platform::is_same::value, + "Element A must be fp16 or bf16"); + + static_assert(platform::is_same::value || platform::is_same::value, + "Element B must be uint8 or uint4"); + + using OperatorInfo = arch::DetagOperator; + using Operator = typename OperatorInfo::Operator; + static_assert(OperatorInfo::QuantOp == WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY, ""); + + static constexpr bool DqAfterLDG = platform::is_same::value; + static constexpr bool arch_has_bf16_mma = ArchTag::kMinComputeCapability >= 80; + using MmaCoreElementA = typename platform::conditional::type; + using MmaCoreElementB = typename platform::conditional::type; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, ElementA, LayoutA, 1, + typename MmaCore::IteratorThreadMapA, kAlignmentA>; + +private: + static constexpr int ColumnsInterleaved = LayoutB::kColumnsInterleaved; + static constexpr int RowsPerTile = LayoutB::kRowsPerTile; + static_assert(!(MmaCore::Shape::kN % ColumnsInterleaved), ""); + static_assert(RowsPerTile == MmaCore::Shape::kK, ""); + + using OriginalThreadMap = typename MmaCore::IteratorThreadMapB; + using OriginalWarpArrangement = typename OriginalThreadMap::Detail::WarpThreadArrangement; + static_assert(!(OriginalWarpArrangement::kStrided % ColumnsInterleaved), ""); + + using GmemIteratorShape + = MatrixShape; + using GmemThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, OriginalThreadMap::kThreads, + layout::PitchLinearShape, + 
MmaCore::kAccessSizeInBits / sizeof_bits::value>; + +public: + // Define iterators over tiles from the B operand + using IteratorB = cutlass::transform::threadblock::PredicatedTileIterator; + + // ThreadMap for scale iterator + static_assert((MmaCore::Shape::kN % kAlignmentScale) == 0, ""); + using IteratorScaleThreadMap + = transform::PitchLinearStripminedThreadMap, + MmaCore::Shape::kN / kAlignmentScale, kAlignmentScale>; + + // Define iterators over tiles from the scale operand + using IteratorScale + = cutlass::transform::threadblock::PredicatedTileIterator, + ElementScale, LayoutScale, 0, IteratorScaleThreadMap, kAlignmentScale>; + + using SmemScaleType = typename platform::conditional::type; + using SmemIteratorScale + = cutlass::transform::threadblock::PredicatedTileIterator, + SmemScaleType, LayoutScale, 0, IteratorScaleThreadMap, kAlignmentScale>; + + using Converters = SetConverters; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::DqMmaPipelined; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/csrc/cutlass_extensions/gemm/threadblock/default_mma.h b/csrc/cutlass_extensions/gemm/threadblock/default_mma.h new file mode 100644 index 000000000000..8f5cb8a71b9c --- /dev/null +++ b/csrc/cutlass_extensions/gemm/threadblock/default_mma.h @@ -0,0 +1,290 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "cutlass_extensions/gemm/threadblock/default_dq_mma_multistage.h" +#include "cutlass_extensions/gemm/threadblock/default_dq_mma_pipelined.h" +#include "cutlass_extensions/gemm/threadblock/default_mma_bf16.h" + +namespace cutlass +{ +namespace gemm +{ +namespace threadblock +{ + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row-major output (OperatorClass TensorOp), fp16 activation & int8 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator> +struct DefaultMma +{ + +private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + +public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Specialization for row-major output (OperatorClass TensorOp), fp16 activation & int4 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator> +struct DefaultMma +{ + +private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + +public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating 
architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// + int kStages, + /// Shared memory clear option + SharedMemoryClearOption SharedMemoryClear> +struct DefaultMma +{ + +private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + +public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Specialization for row-major output (OperatorClass TensorOp), fp16 activation & int4 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// + int kStages, + /// Shared memory clear option + SharedMemoryClearOption SharedMemoryClear> +struct DefaultMma +{ + +private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + +public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +// fp16 x fp16 specialization on Ampere to use mma multistage for 2 stage. 
Helps avoid reg spills on +// large tile when not enough shared mem is present to do 3+ stage +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear, + /// Gather operand A by using an index array + bool GatherA, + /// Gather operand B by using an index array + bool GatherB> +struct DefaultMma +{ + + // Define the MmaCore components + // 3 is used on purpose here to trigger components for mma multistage + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, half_t, LayoutA, 1, ThreadMapA, AccessTypeA, + GatherA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, half_t, LayoutB, 0, ThreadMapB, AccessTypeB, + GatherB>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/csrc/cutlass_extensions/gemm/threadblock/default_mma_bf16.h b/csrc/cutlass_extensions/gemm/threadblock/default_mma_bf16.h new file mode 100644 index 000000000000..0a952900cd79 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/threadblock/default_mma_bf16.h @@ -0,0 +1,353 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass_extensions/gemm/threadblock/default_dq_mma_multistage.h" +#include "cutlass_extensions/gemm/threadblock/default_dq_mma_pipelined.h" + +namespace cutlass +{ +namespace gemm +{ +namespace threadblock +{ + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row-major output (OperatorClass TensorOp), bf16 activation & bf16 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear, + /// Gather operand A by using an index array + bool GatherA, + /// Gather operand B by using an index array + bool GatherB> +struct DefaultMma +{ + +private: + // Conversions only needed pre-ampere. This will trigger mma pipeline, so we convert before STS. + static constexpr bool arch_has_bf16_mma = ArchTag::kMinComputeCapability >= 80; + using MmaElementA = typename platform::conditional::type; + using MmaElementB = typename platform::conditional::type; + +public: + // Define the MmaCore components + using MmaCore = + typename cutlass::gemm::threadblock::DefaultMmaCore; + + using IteratorA = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, bfloat16_t, LayoutA, 1, + typename MmaCore::IteratorThreadMapA, kAlignmentA, GatherA>; + + // Define iterators over tiles from the B operand + using IteratorB = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, bfloat16_t, LayoutB, 0, + typename MmaCore::IteratorThreadMapB, kAlignmentB, GatherB>; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined; +}; + +// bf16 x bf16 specialization on Ampere to use mma multistage for 2 stage. 
Helps avoid reg spills on +// large tile when not enough shared mem is present to do 3+ stage +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear, + /// Gather operand A by using an index array + bool GatherA, + /// Gather operand B by using an index array + bool GatherB> +struct DefaultMma +{ + + // Define the MmaCore components + // 3 is used on purpose here to trigger components for mma multistage + using MmaCore = + typename cutlass::gemm::threadblock::DefaultMmaCore; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, bfloat16_t, LayoutA, 1, ThreadMapA, + AccessTypeA, GatherA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, bfloat16_t, LayoutB, 0, ThreadMapB, + AccessTypeB, GatherB>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row-major output (OperatorClass TensorOp), bf16 activation & int8 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator> +struct DefaultMma +{ + +private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + +public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Specialization for row-major output (OperatorClass TensorOp), bf16 activation & 
int4 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator> +struct DefaultMma +{ + +private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + +public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// + int kStages, + /// Shared memory clear option + SharedMemoryClearOption SharedMemoryClear> +struct DefaultMma +{ + +private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + +public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Specialization for row-major output (OperatorClass TensorOp), fp16 activation & int4 weight +template < + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// + int kStages, + /// Shared memory 
clear option + SharedMemoryClearOption SharedMemoryClear> +struct DefaultMma +{ + +private: + static constexpr int kAlignmentScale = 128 / sizeof_bits::value; + + using Mma = DqMma; + +public: + // Define the MmaCore components + using MmaCore = typename Mma::MmaCore; + + // Define iterators over tiles from the A operand + using IteratorA = typename Mma::IteratorA; + + // Define iterators over tiles from the B operand + using IteratorB = typename Mma::IteratorB; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = typename Mma::ThreadblockMma; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/csrc/cutlass_extensions/gemm/threadblock/dq_mma_base.h b/csrc/cutlass_extensions/gemm/threadblock/dq_mma_base.h new file mode 100644 index 000000000000..dff66be7593f --- /dev/null +++ b/csrc/cutlass_extensions/gemm/threadblock/dq_mma_base.h @@ -0,0 +1,257 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. 
+*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/mma_base.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass_extensions/weight_only_quant_op.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace threadblock +{ + +//////////////////////////////////////////////////////////////////////////////// +// SFINAE trick so I can keep the same loop code for Volta and dispatch to the +// correct warp level mma. On volta, all data is stored to shared memory as FP16. +template +CUTLASS_DEVICE void run_warp_mma(WarpMma& warp_mma, typename WarpMma::FragmentC& D, + typename WarpMma::FragmentA const& A, typename WarpMma::FragmentB const& B, typename WarpMma::FragmentC const& C, + const int warp_tileB_k_offset) +{ + warp_mma(D, A, B, C); +} + +template +CUTLASS_DEVICE void run_warp_mma(WarpMma& warp_mma, typename WarpMma::FragmentC& D, + typename WarpMma::TransformedFragmentA const& A, typename WarpMma::TransformedFragmentB const& B, + typename WarpMma::FragmentC const& C, const int warp_tileB_k_offset) +{ + warp_mma(D, A, B, C, warp_tileB_k_offset); +} + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// The type of the scales + typename ElementScale_, + /// Number of stages, + int Stages, + /// The dequantizing op to be performed. + WeightOnlyQuantOp DequantOp, + /// Used for partial specialization, + typename Enable = bool> +class DqMmaBase +{ +public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Policy describing tuning details + using Policy = Policy_; + + ///< Type of the scale to be loaded + using ElementScale = ElementScale_; + + static_assert(DequantOp != WeightOnlyQuantOp::UNDEFINED, ""); + + // Finegrained scales get streamed in via cp.async + static constexpr int ScalebiasStages = isFinegrained(DequantOp) ? Stages : 1; + // We always have scales. + static constexpr int ScaleElementsPerStage = Shape::kN; + // We sometimes have a bias + static constexpr int BiasElementsPerStage = hasZero(DequantOp) ? Shape::kN : 0; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. 
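The staging constants above decide how much shared memory the scale and zero-point buffers get: fine-grained scales are multi-buffered alongside the mainloop stages, per-column scales need only a single buffer, and a zero-point buffer exists only when the quant op carries zeros. A compile-time sketch that mirrors that logic; the QuantOp enum and helper names below are simplified stand-ins for the WeightOnlyQuantOp machinery, not the real definitions.

```cpp
#include <cstdio>

enum class QuantOp { PerColumnScaleOnly, FinegrainedScaleOnly, FinegrainedScaleAndZeros };

constexpr bool is_finegrained(QuantOp op) { return op != QuantOp::PerColumnScaleOnly; }
constexpr bool has_zero(QuantOp op) { return op == QuantOp::FinegrainedScaleAndZeros; }

// Mirrors the DqMmaBase constants: fine-grained scales are staged per mainloop
// stage, per-column scales use one buffer, zeros exist only for scale+zero ops.
constexpr int scalebias_stages(QuantOp op, int stages) { return is_finegrained(op) ? stages : 1; }
constexpr int bias_elements_per_stage(QuantOp op, int tile_n) { return has_zero(op) ? tile_n : 0; }

int main() {
    constexpr int kStages = 4, kTileN = 128;
    static_assert(scalebias_stages(QuantOp::PerColumnScaleOnly, kStages) == 1, "one scale buffer");
    static_assert(scalebias_stages(QuantOp::FinegrainedScaleAndZeros, kStages) == kStages,
                  "scales streamed per stage");
    static_assert(bias_elements_per_stage(QuantOp::FinegrainedScaleOnly, kTileN) == 0, "no zeros");
    static_assert(bias_elements_per_stage(QuantOp::FinegrainedScaleAndZeros, kTileN) == kTileN,
                  "one zero per column per stage");
    std::printf("scale/zero shared-memory staging is resolved at compile time\n");
    return 0;
}
```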
+ using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = GemmShape; + + /// Number of warp-level GEMM operations + static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); + + static constexpr int kNumKIterationsPerWarpBLoad + = Operator::IteratorB::InstructionShape::kRow / Operator::InstructionShape::kK; + + static_assert(!(kWarpGemmIterations % kNumKIterationsPerWarpBLoad), ""); + static constexpr int kWarpGemmIterationsForB = kWarpGemmIterations / kNumKIterationsPerWarpBLoad; + + /// Number of stages + static int const kStages = Stages; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef; + + /// Tensor reference to the B operand + using TensorRefB = TensorRef; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + class SharedStorage + { + public: + // + // Type definitions + // + + /// Shape of the A matrix operand in shared memory + using ShapeA + = MatrixShape; + + /// Shape of the B matrix operand in shared memory + using ShapeB + = MatrixShape; + + /// Shape of the shared memory buffer for the scales for the B matrix. + using ShapeScale = MatrixShape; + /// Shape of the shared memory buffer for the biases of the B matrix. + using ShapeZero = MatrixShape; + + public: + // + // Data members + // + + /// Buffer for A operand + AlignedBuffer operand_A; + + /// Buffer for B operand + AlignedBuffer operand_B; + + /// Buffer to hold scales for threadblock + AlignedBuffer operand_scale; + + /// Buffer to hold scales for threadblock + AlignedBuffer operand_zero; + + public: + // + // Methods + // + + /// Returns a layout object for the A matrix + CUTLASS_DEVICE + static typename Operator::LayoutA LayoutA() + { + return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn}); + } + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator::LayoutB LayoutB() + { + return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn}); + } + + /// Returns a TensorRef to the A operand + CUTLASS_HOST_DEVICE + TensorRefA operand_A_ref() + { + return TensorRefA{operand_A.data(), LayoutA()}; + } + + /// Returns a TensorRef to the B operand + CUTLASS_HOST_DEVICE + TensorRefB operand_B_ref() + { + return TensorRefB{operand_B.data(), LayoutB()}; + } + }; + +protected: + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A operand from shared memory + typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + +public: + /// Construct from tensor references + CUTLASS_DEVICE + DqMmaBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorage& shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx) + , warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) + { + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/threadblock/dq_mma_multistage.h 
b/csrc/cutlass_extensions/gemm/threadblock/dq_mma_multistage.h new file mode 100644 index 000000000000..3c4036dd8cc5 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/threadblock/dq_mma_multistage.h @@ -0,0 +1,110 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass_extensions/gemm/threadblock/dq_mma_base.h" +#include "cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h" +#include "cutlass_extensions/interleaved_numeric_conversion.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace threadblock +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
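The header below only forward-declares the primary DqMmaMultistage template; the two #includes at its end pull in partial specializations that are selected by the weight-only quantization op. A compact sketch of that declare-then-specialize dispatch, with invented names standing in for the CUTLASS machinery and enable_if used as the selector (an assumption about the mechanism, kept deliberately generic):

#include <type_traits>

// Hypothetical mirror of the dispatch pattern (names invented here).
enum class FakeQuantOp { PerColumn, FineGrained };

constexpr bool isFineGrained(FakeQuantOp op) { return op == FakeQuantOp::FineGrained; }

// Primary template: declared but never defined.
template <FakeQuantOp Op, typename Enable = void>
class FakeDqMmaMultistage;

// Would live in a "percol" header: enabled only for per-column scaling.
template <FakeQuantOp Op>
class FakeDqMmaMultistage<Op, typename std::enable_if<!isFineGrained(Op)>::type>
{
public:
    static constexpr char const* kind() { return "per-column scales"; }
};

// Would live in a "finegrained" header: enabled only for group-wise scales (and zeros).
template <FakeQuantOp Op>
class FakeDqMmaMultistage<Op, typename std::enable_if<isFineGrained(Op)>::type>
{
public:
    static constexpr char const* kind() { return "group-wise scales and zeros"; }
};

Instantiating FakeDqMmaMultistage<FakeQuantOp::PerColumn> resolves to the first specialization and FakeQuantOp::FineGrained to the second; exactly one enable_if condition survives substitution, which is how the two mainloop headers below can coexist behind one name.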
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type for the scales + typename IteratorScale_, + /// Iterators over scales in shared memory + typename SmemIteratorScale_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Converter for B matrix applited immediately after the LDS + typename TransformBAfterLDS_, + /// The quantization operator being used + WeightOnlyQuantOp QuantOp_, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone, + /// Used for partial specialization + typename Enable = void> +class DqMmaMultistage; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +#include "cutlass_extensions/gemm/threadblock/dq_mma_multistage_finegrained.h" +#include "cutlass_extensions/gemm/threadblock/dq_mma_multistage_percol.h" diff --git a/csrc/cutlass_extensions/gemm/threadblock/dq_mma_multistage_finegrained.h b/csrc/cutlass_extensions/gemm/threadblock/dq_mma_multistage_finegrained.h new file mode 100644 index 000000000000..76564f14ba2d --- /dev/null +++ b/csrc/cutlass_extensions/gemm/threadblock/dq_mma_multistage_finegrained.h @@ -0,0 +1,691 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass_extensions/gemm/threadblock/dq_mma_base.h" +#include "cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h" +#include "cutlass_extensions/interleaved_numeric_conversion.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace threadblock +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type for the scales + typename IteratorScale_, + /// Iterators over scales in shared memory + typename SmemIteratorScale_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Converter for B matrix applited immediately after the LDS + typename TransformBAfterLDS_, + /// The quantization operator being used + WeightOnlyQuantOp QuantOp_, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear> +class DqMmaMultistage> + : public DqMmaBase +{ +public: + ///< Base class + using Base = DqMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using 
LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy = Policy_; + + using IteratorScale = IteratorScale_; + using ElementScale = typename IteratorScale::Element; + using LayoutScale = typename IteratorScale::Layout; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + using SmemIteratorScale = SmemIteratorScale_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + using TransformBAfterLDS = TransformBAfterLDS_; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + using Dequantizer = warp::MmaTensorOpDequantizer; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + static_assert(Base::SharedStorage::ShapeScale::kRow == Stages, ""); + static_assert(Base::SharedStorage::ShapeScale::kColumn == Shape::kN, ""); + + /// Internal structure exposed for introspection. + struct Detail + { + + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA + = (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB + = (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + +private: + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + Dequantizer warp_dequantizer_; + + using ElementB = typename IteratorB::Element; + using LayoutDetailsForB = kernel::LayoutDetailsB; + + static constexpr bool RequiresTileInterleave + = layout::IsColumnMajorTileInterleave::value; + static_assert(!RequiresTileInterleave || (RequiresTileInterleave && (Shape::kK == LayoutDetailsForB::ThreadblockK)), + "Layout K must match threadblockK"); + +private: + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Iterator to write threadblock-scoped tile of scale and zero operand to shared memory + SmemIteratorScale smem_iterator_scale_; + +public: + /// Construct from tensor references + CUTLASS_DEVICE + DqMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& shared_storage, + /// The group size for quantization + int group_size, + ///< ID within the threadblock + int 
thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(shared_storage, thread_idx, warp_idx, lane_idx) + , warp_dequantizer_({shared_storage.operand_scale.data(), LayoutScale(Shape::kN)}, + {shared_storage.operand_zero.data(), LayoutScale(Shape::kN)}, + (warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN)) / Base::WarpCount::kM, lane_idx) + , smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx) + , smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + , smem_iterator_scale_(LayoutScale(Shape::kN), shared_storage.operand_scale.data(), + shared_storage.operand_zero.data(), {Base::kStages, Shape::kN}, thread_idx, group_size) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterationsForB * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_scales_and_advance(IteratorScale& iterator_scale, int stage = -1, int k_iter = -1) + { + static_assert(IteratorScale::Shape::kRow == 1, "Scale stride must be 1."); + + typename IteratorScale::AccessType* gmem_scale_ptr = iterator_scale.get_scale(); + typename IteratorScale::AccessType* gmem_zero_ptr = iterator_scale.get_zero(); + + typename IteratorScale::AccessType* smem_scale_ptr + = reinterpret_cast(this->smem_iterator_scale_.get_scale()); + typename IteratorScale::AccessType* smem_zero_ptr + = reinterpret_cast(this->smem_iterator_scale_.get_zero()); + + int const kSrcBytes = sizeof_bits::value * IteratorScale::kAlignment / 8; + + cutlass::arch::cp_async(smem_scale_ptr, gmem_scale_ptr, iterator_scale.valid()); + + if (gmem_zero_ptr != nullptr) + { + cutlass::arch::cp_async(smem_zero_ptr, gmem_zero_ptr, iterator_scale.valid()); + } + + if (iterator_scale.group_size_ == 64) + { + iterator_scale.add_tile_offset({1, 0}); + } + else if (iterator_scale.group_size_ == 128) + { + if (iterator_scale.row_groupsize64_ & 0x1) + { + iterator_scale.add_tile_offset({1, 0}); + } + } + + iterator_scale.row_groupsize64_++; + + this->smem_iterator_scale_.add_tile_offset({1, 0}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA& iterator_A, IteratorB& iterator_B, IteratorScale& iterator_scale, + int group_start_A = 0, int group_start_B = 0) + { + iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) + { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) + { + typename IteratorA::AccessType* dst_ptr + = reinterpret_cast(this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value + * IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < 
IteratorA::kAccessesPerVector; ++v) + { + auto gmem_ptr = iterator_A.get(); + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) + { + cutlass::arch::cp_async_zfill(dst_ptr + v, gmem_ptr, iterator_A.valid()); + } + else + { + cutlass::arch::cp_async(dst_ptr + v, gmem_ptr, iterator_A.valid()); + } + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) + { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) + { + typename IteratorB::AccessType* dst_ptr + = reinterpret_cast(this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value + * IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) + { + auto gmem_ptr = iterator_B.get(); + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) + { + cutlass::arch::cp_async_zfill(dst_ptr + v, gmem_ptr, iterator_B.valid()); + } + else + { + cutlass::arch::cp_async(dst_ptr + v, gmem_ptr, iterator_B.valid()); + } + + ++iterator_B; + } + ++this->smem_iterator_B_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC& accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< iterator over scale operand in global memory + IteratorScale iterator_scale, + ///< initial value of accumulator + FragmentC const& src_accum) + { + + // + // Prologue + // + + TransformBAfterLDS lds_converter; + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; ++stage, --gemm_k_iterations) + { + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_scale.clear_mask(gemm_k_iterations == 0); + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) + { + typename IteratorA::AccessType* dst_ptr + = reinterpret_cast(this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) + { + int const kSrcBytes = sizeof_bits::value + * IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? 
kSrcBytes : 0); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) + { + typename IteratorB::AccessType* dst_ptr + = reinterpret_cast(this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) + { + int const kSrcBytes = sizeof_bits::value + * IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + + ++this->smem_iterator_B_; + } + + copy_scales_and_advance(iterator_scale, stage, gemm_k_iterations); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // + // Clear the remaining tiles of SMEM. This is a functional requirement for some kernels + // so that all accumulator elements outside the GEMM footprint are zero. + // + + if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) + { + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_); + + typename IteratorA::AccessType zero_A; + zero_A.clear(); + + last_smem_iterator_A.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) + { + + typename IteratorA::AccessType* dst_ptr + = reinterpret_cast(last_smem_iterator_A.get()); + + *dst_ptr = zero_A; + + ++last_smem_iterator_A; + } + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_); + typename IteratorB::AccessType zero_B; + + zero_B.clear(); + last_smem_iterator_B.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) + { + + typename IteratorB::AccessType* dst_ptr + = reinterpret_cast(last_smem_iterator_B.get()); + + *dst_ptr = zero_B; + + ++last_smem_iterator_B; + } + } + + // Wait until we have at least one committed global fetch stage. 
(#uncommitted = Base::kStages - 1 - #committed) + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + typename Dequantizer::FragmentScale warp_frag_scales; + typename Dequantizer::FragmentZero warp_frag_zeros; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + warp_dequantizer_.load(warp_frag_scales, warp_frag_zeros); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + warp_dequantizer_.add_pointer_offset(Shape::kN); + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_scale.clear_mask(gemm_k_iterations == 0); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) + { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) + { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + ++this->warp_tile_iterator_A_; + + const int warp_tileB_k_compute_offset = warp_mma_k % Base::kNumKIterationsPerWarpBLoad; + const int warp_tileB_k_load_offset = warp_mma_k / Base::kNumKIterationsPerWarpBLoad; + if (warp_tileB_k_compute_offset == Base::kNumKIterationsPerWarpBLoad - 1) + { + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_tileB_k_load_offset + 1) % Base::kWarpGemmIterationsForB); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]); + ++this->warp_tile_iterator_B_; + } + + typename TransformBAfterLDS::result_type converted_frag_B + = lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]); + warp_dequantizer_.dequantize(converted_frag_B, warp_frag_scales, warp_frag_zeros); + + run_warp_mma( + warp_mma, accum, warp_frag_A[warp_mma_k % 2], converted_frag_B, accum, warp_tileB_k_compute_offset); + + // Issue global->shared copies for this stage + if (warp_mma_k < Base::kWarpGemmIterations - 1) + { + int group_start_iteration_A, group_start_iteration_B; + + group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA; + group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance( + iterator_A, iterator_B, iterator_scale, group_start_iteration_A, group_start_iteration_B); + + // This is the first group of a given stage, so we issue the loads for the B scales immediately.
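As an aside, the group-scheduling arithmetic used here is plain integer math; a host-side sketch with arbitrary example sizes (these numbers are not taken from any real tile configuration) shows why group_start_iteration_B is zero exactly on the first warp-level iteration:

#include <cstdio>

int main()
{
    // Arbitrary example sizes: 8 warp-level MMA iterations per stage and 12
    // cp.async copies required to fill one stage of operand B.
    constexpr int kWarpGemmIterations = 8;
    constexpr int kAsyncCopyIterationsPerStageB = 12;

    // Ceiling division, as in Detail::kAccessesPerGroupB: spread the copies
    // evenly over the warp iterations that issue them.
    constexpr int kAccessesPerGroupB
        = (kAsyncCopyIterationsPerStageB + kWarpGemmIterations - 1) / kWarpGemmIterations; // == 2

    for (int warp_mma_k = 0; warp_mma_k < kWarpGemmIterations - 1; ++warp_mma_k)
    {
        int const group_start_B = warp_mma_k * kAccessesPerGroupB;
        // group_start_B == 0 only on the first warp iteration; copies whose index
        // would exceed kAsyncCopyIterationsPerStageB are skipped by the bounds check.
        std::printf("warp_mma_k=%d issues B copies [%d, %d)\n", warp_mma_k, group_start_B,
            group_start_B + kAccessesPerGroupB);
    }
    return 0;
}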
+ if (group_start_iteration_B == 0) + { + copy_scales_and_advance(iterator_scale); + } + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) + { + int group_start_iteration_A, group_start_iteration_B; + group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance( + iterator_A, iterator_B, iterator_scale, group_start_iteration_A, group_start_iteration_B); + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Wait until we have at least one committed global fetch stage. (#uncommitted = Base::kStages - 1 - + // #committed) + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) + { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + this->smem_iterator_scale_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } + else + { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) + { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterationsForB, 0}); + warp_dequantizer_.add_pointer_offset(-Base::kStages * Shape::kN); + smem_read_stage_idx = 0; + } + else + { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + iterator_scale.clear_mask(gemm_k_iterations == 0); + } + } + + // Load the scale needed for the next tile iteration. + warp_dequantizer_.load(warp_frag_scales, warp_frag_zeros); + // Update internal pointer to set of scales in shared memory. + warp_dequantizer_.add_pointer_offset(Shape::kN); + } + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) + { + // commit and drain all pending and predicated LDGSTS pnz from the GEMM mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/threadblock/dq_mma_multistage_percol.h b/csrc/cutlass_extensions/gemm/threadblock/dq_mma_multistage_percol.h new file mode 100644 index 000000000000..5ec515c28712 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/threadblock/dq_mma_multistage_percol.h @@ -0,0 +1,636 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass_extensions/gemm/threadblock/dq_mma_base.h" +#include "cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h" +#include "cutlass_extensions/interleaved_numeric_conversion.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace threadblock +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type for the scales + typename IteratorScale_, + /// Iterators over scales in shared memory + typename SmemIteratorScale_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Converter for B matrix applited immediately after the LDS + typename TransformBAfterLDS_, + /// The quantization operator being used + WeightOnlyQuantOp QuantOp_, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear> +class DqMmaMultistage> + : public DqMmaBase +{ +public: + ///< Base class + using Base = DqMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy = Policy_; + + using IteratorScale = IteratorScale_; + using ElementScale = typename IteratorScale::Element; + using LayoutScale = typename IteratorScale::Layout; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + using SmemIteratorScale = SmemIteratorScale_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + using TransformBAfterLDS = TransformBAfterLDS_; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + + // + // Dependent types + // + + /// Fragment of operand Scale loaded from global memory; + using FragmentScale = typename IteratorScale::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + using Dequantizer = warp::MmaTensorOpDequantizer; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + /// Internal structure exposed for introspection. 
+ struct Detail + { + + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA + = (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB + = (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + +private: + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + Dequantizer warp_dequantizer_; + + using ElementB = typename IteratorB::Element; + using LayoutDetailsForB = kernel::LayoutDetailsB; + + static constexpr bool RequiresTileInterleave + = layout::IsColumnMajorTileInterleave::value; + static_assert(!RequiresTileInterleave || (RequiresTileInterleave && (Shape::kK == LayoutDetailsForB::ThreadblockK)), + "Layout K must match threadblockK"); + +private: + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Iterator to write threadblock-scoped tile of scale operand to shared memory + SmemIteratorScale smem_iterator_scale_; + +public: + /// Construct from tensor references + CUTLASS_DEVICE + DqMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& shared_storage, + ///< Group size for quantization. 
Not used by this main loop since it assumes per-column + const int group_size, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(shared_storage, thread_idx, warp_idx, lane_idx) + , warp_dequantizer_({shared_storage.operand_scale.data(), LayoutScale(Shape::kN)}, + (warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN)) / Base::WarpCount::kM, lane_idx) + , smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx) + , smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + , smem_iterator_scale_(LayoutScale(Shape::kN), shared_storage.operand_scale.data(), {1, Shape::kN}, thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterationsForB * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance( + IteratorA& iterator_A, IteratorB& iterator_B, int group_start_A = 0, int group_start_B = 0) + { + iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) + { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) + { + typename IteratorA::AccessType* dst_ptr + = reinterpret_cast(this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value + * IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) + { + auto gmem_ptr = iterator_A.get(); + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) + { + cutlass::arch::cp_async_zfill(dst_ptr + v, gmem_ptr, iterator_A.valid()); + } + else + { + cutlass::arch::cp_async(dst_ptr + v, gmem_ptr, iterator_A.valid()); + } + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) + { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) + { + typename IteratorB::AccessType* dst_ptr + = reinterpret_cast(this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value + * IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) + { + auto gmem_ptr = iterator_B.get(); + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) + { + cutlass::arch::cp_async_zfill(dst_ptr + v, gmem_ptr, iterator_B.valid()); + } + else + { + cutlass::arch::cp_async(dst_ptr + v, gmem_ptr, 
iterator_B.valid()); + } + + ++iterator_B; + } + ++this->smem_iterator_B_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC& accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< iterator over scale operand in global memory + IteratorScale iterator_scale, + ///< initial value of accumulator + FragmentC const& src_accum) + { + + // + // Prologue + // + + TransformBAfterLDS lds_converter; + + // NOTE - switch to ldg.sts + // Issue this first, so cp.async.commit_group will commit this load as well. + // Note: we do not commit here and this load will commit in the same group as + // the first load of A. + FragmentScale tb_frag_scales; + tb_frag_scales.clear(); + iterator_scale.load(tb_frag_scales); + this->smem_iterator_scale_.store(tb_frag_scales); + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; ++stage, --gemm_k_iterations) + { + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) + { + typename IteratorA::AccessType* dst_ptr + = reinterpret_cast(this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) + { + int const kSrcBytes = sizeof_bits::value + * IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? kSrcBytes : 0); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) + { + typename IteratorB::AccessType* dst_ptr + = reinterpret_cast(this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) + { + int const kSrcBytes = sizeof_bits::value + * IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + + ++this->smem_iterator_B_; + } + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // + // Clear the remaining tiles of SMEM. This is a functional requirement for some kernels + // so that all accumulator elements outside the GEMM footprint are zero. 
+ // + + if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) + { + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_); + + typename IteratorA::AccessType zero_A; + zero_A.clear(); + + last_smem_iterator_A.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) + { + + typename IteratorA::AccessType* dst_ptr + = reinterpret_cast(last_smem_iterator_A.get()); + + *dst_ptr = zero_A; + + ++last_smem_iterator_A; + } + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_); + typename IteratorB::AccessType zero_B; + + zero_B.clear(); + last_smem_iterator_B.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) + { + + typename IteratorB::AccessType* dst_ptr + = reinterpret_cast(last_smem_iterator_B.get()); + + *dst_ptr = zero_B; + + ++last_smem_iterator_B; + } + } + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + typename Dequantizer::FragmentScale warp_frag_scales; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + warp_dequantizer_.load(warp_frag_scales); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) + { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) + { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
+ + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + ++this->warp_tile_iterator_A_; + + const int warp_tileB_k_compute_offset = warp_mma_k % Base::kNumKIterationsPerWarpBLoad; + const int warp_tileB_k_load_offset = warp_mma_k / Base::kNumKIterationsPerWarpBLoad; + if (warp_tileB_k_compute_offset == Base::kNumKIterationsPerWarpBLoad - 1) + { + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_tileB_k_load_offset + 1) % Base::kWarpGemmIterationsForB); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]); + ++this->warp_tile_iterator_B_; + } + + typename TransformBAfterLDS::result_type converted_frag_B + = lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]); + warp_dequantizer_.dequantize(converted_frag_B, warp_frag_scales); + + run_warp_mma( + warp_mma, accum, warp_frag_A[warp_mma_k % 2], converted_frag_B, accum, warp_tileB_k_compute_offset); + + // Issue global->shared copies for this stage + if (warp_mma_k < Base::kWarpGemmIterations - 1) + { + int group_start_iteration_A, group_start_iteration_B; + + group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA; + group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, group_start_iteration_B); + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) + { + int group_start_iteration_A, group_start_iteration_B; + group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, group_start_iteration_B); + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages have committed.
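A host-side sketch of the circular-buffer bookkeeping that follows; kStages is an arbitrary example value here, and the real kernel advances these indices together with the shared-memory tile offsets rather than in isolation:

#include <cassert>

int main()
{
    constexpr int kStages = 3; // example value only; real kernels choose this per tile configuration

    // Matches the initial values set before the mainloop: the prologue has already
    // filled kStages - 1 shared-memory stages.
    int smem_write_stage_idx = kStages - 1;
    int smem_read_stage_idx = 0;

    for (int iter = 0; iter < 12; ++iter)
    {
        // Advance the write slot, wrapping back to the start of the circular buffer.
        smem_write_stage_idx = (smem_write_stage_idx == kStages - 1) ? 0 : smem_write_stage_idx + 1;
        // Advance the read slot the same way.
        smem_read_stage_idx = (smem_read_stage_idx == kStages - 1) ? 0 : smem_read_stage_idx + 1;

        // The reader always trails the writer by kStages - 1 slots.
        assert((smem_write_stage_idx + kStages - smem_read_stage_idx) % kStages == kStages - 1);
    }
    return 0;
}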
+ arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) + { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } + else + { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) + { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterationsForB, 0}); + smem_read_stage_idx = 0; + } + else + { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + } + } + } + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) + { + // commit and drain all pending and predicated LDGSTS pnz from the GEMM mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h b/csrc/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h new file mode 100644 index 000000000000..e8f5a92c3f02 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h @@ -0,0 +1,397 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/gemm.h" + +#include "cutlass_extensions/gemm/threadblock/dq_mma_base.h" +#include "cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h" +#include "cutlass_extensions/interleaved_numeric_conversion.h" + +#include "cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h" +#include "cutlass_extensions/gemm_configs.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace threadblock +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type for the scales + typename IteratorScale_, + /// Iterators over scales in shared memory + typename SmemIteratorScale_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Converter for B matrix applied immediately after the LDG (before STS) + typename TransformBAfterLDG_, + /// Converter for B matrix applited immediately after the LDS + typename TransformBAfterLDS_, + /// The quantization operator being used + WeightOnlyQuantOp QuantOp_, + /// Used for partial specialization + typename Enable = bool> +class DqMmaPipelined : public DqMmaBase +{ +public: + ///< Base class + using Base = DqMmaBase; + + using Shape = Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA = IteratorA_; ///< Iterates over tiles of A operand in global memory + using IteratorB = IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = 
Policy_; ///< Policy describing tuning details + + using IteratorScale = IteratorScale_; + using ElementScale = typename IteratorScale::Element; + using LayoutScale = typename IteratorScale::Layout; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + using SmemIteratorScale = SmemIteratorScale_; + + using TransformBAfterLDG = TransformBAfterLDG_; + using TransformBAfterLDS = TransformBAfterLDS_; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of operand Scale loaded from global memory; + using FragmentScale = typename IteratorScale::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + using Dequantizer = warp::MmaTensorOpDequantizer; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // staticaly assert kStages for DqMmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages == 2), "DqMmaPipelined requires kStages set to value 2"); + +private: + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + Dequantizer warp_dequantizer_; + + using ElementB = typename IteratorB::Element; + using LayoutDetailsForB = kernel::LayoutDetailsB; + + static constexpr bool RequiresTileInterleave + = layout::IsColumnMajorTileInterleave::value; + static_assert(!RequiresTileInterleave || (RequiresTileInterleave && (Shape::kK == LayoutDetailsForB::ThreadblockK)), + "Layout K must match threadblockK"); + +protected: + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Iterator to write threadblock-scoped tile of scale operand to shared memory + SmemIteratorScale smem_iterator_scale_; + +public: + /// Construct from tensor references + CUTLASS_DEVICE + DqMmaPipelined(typename Base::SharedStorage& + shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM + const int group_size, ///< Will not be used, just to adapt to finegrained modifications and make the compilation + ///< successful. Because DqMmaPipelined is only enabled for sm<80, so even if this + ///< argument is not added, it does not affect compilation for sm>=80. 
+ int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ) + : Base(shared_storage, thread_idx, warp_idx, lane_idx) + , warp_dequantizer_({shared_storage.operand_scale.data(), LayoutScale(Shape::kN)}, + (warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN)) / Base::WarpCount::kM, lane_idx) + , smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx) + , smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + , smem_iterator_scale_(LayoutScale(Shape::kN), shared_storage.operand_scale.data(), {1, Shape::kN}, thread_idx) + { + + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterationsForB * warp_idx_k, warp_idx_n}); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()(int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC& accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + IteratorScale iterator_scale, ///< iterator over scale operand in global memory + FragmentC const& src_accum) + { ///< source accumulator tile + + // + // Prologue + // + TransformBAfterLDG ldg_converter; + TransformBAfterLDS lds_converter; + + using TransformA + = NumericArrayConverter; + + using TransformScale = NumericArrayConverter; + + // These transforms are mainly to handle when we have bfloat activations and weights in GMEM and want + // to issue HMMA on architectures older than Ampere. We will convert to FP16 before STS. 
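+        // A rough illustration of the note above (assuming bfloat16_t activations feeding a
+        // pre-Ampere fp16 HMMA path): TransformA then acts as an element-wise NumericArrayConverter
+        // over the staged fragment, conceptually
+        //   for (int i = 0; i < FragmentA::kElements; ++i) { smem_value[i] = half_t(tb_frag_A[i]); }
+        // so the tile written by smem_iterator_A_.store() is already in the warp compute type. When
+        // the global and shared element types already match, the converter reduces to a pass-through.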
+ TransformA transformA; + TransformScale transformScale; + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentA tb_frag_A; + FragmentB tb_frag_B; + FragmentScale tb_frag_scales; + + using WarpFragmentScale = typename Dequantizer::FragmentScale; + WarpFragmentScale warp_frag_scales; + + tb_frag_A.clear(); + tb_frag_B.clear(); + tb_frag_scales.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + iterator_scale.load(tb_frag_scales); + + ++iterator_A; + ++iterator_B; + + this->smem_iterator_A_.store(transformA(tb_frag_A)); + this->smem_iterator_B_.store(ldg_converter(tb_frag_B)); + this->smem_iterator_scale_.store(transformScale(tb_frag_scales)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + __syncthreads(); + + warp_dequantizer_.load(warp_frag_scales); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Avoid reading out of bounds + iterator_A.clear_mask(gemm_k_iterations <= 1); + iterator_B.clear_mask(gemm_k_iterations <= 1); + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tighest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) + { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) + { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. + + if (warp_mma_k == Base::kWarpGemmIterations - 1) + { + + // Write fragments to shared memory + this->smem_iterator_A_.store(transformA(tb_frag_A)); + + this->smem_iterator_B_.store(ldg_converter(tb_frag_B)); + + __syncthreads(); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) + { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } + else + { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterationsForB, 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + ++this->warp_tile_iterator_A_; + + const int warp_tileB_k_compute_offset = warp_mma_k % Base::kNumKIterationsPerWarpBLoad; + const int warp_tileB_k_load_offset = warp_mma_k / Base::kNumKIterationsPerWarpBLoad; + // We are just about to finish computing on a fragment of B, so initiate the load for the next fragment. 
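+                // Worked example (assuming int4 weights with fp16 16x8x16 compute, so
+                // Base::kNumKIterationsPerWarpBLoad == 2): compute iterations 2k and 2k+1 both reuse
+                // the B fragment fetched at load offset k, and the guarded load below fires only on
+                // the second iteration of each pair, i.e. when warp_mma_k is odd.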
+ if (warp_tileB_k_compute_offset == Base::kNumKIterationsPerWarpBLoad - 1) + { + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_tileB_k_load_offset + 1) % Base::kWarpGemmIterationsForB); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]); + ++this->warp_tile_iterator_B_; + } + + if (warp_mma_k == 0) + { + + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + // Avoid reading out of bounds if this was the last loop iteration + iterator_A.clear_mask(gemm_k_iterations <= 2); + iterator_B.clear_mask(gemm_k_iterations <= 2); + } + + typename TransformBAfterLDS::result_type converted_frag_B + = lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]); + warp_dequantizer_.dequantize(converted_frag_B, warp_frag_scales); + run_warp_mma( + warp_mma, accum, warp_frag_A[warp_mma_k % 2], converted_frag_B, accum, warp_tileB_k_compute_offset); + } + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/warp/default_mma_tensor_op.h b/csrc/cutlass_extensions/gemm/warp/default_mma_tensor_op.h new file mode 100644 index 000000000000..c8160c59d2d8 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/warp/default_mma_tensor_op.h @@ -0,0 +1,107 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/warp/default_mma_tensor_op.h" +#include "cutlass/gemm/warp/mma_tensor_op.h" + +#include "cutlass_extensions/arch/mma.h" +#include "cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h" + +namespace cutlass +{ +namespace gemm +{ +namespace warp +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for m-by-n-by-kgroup +template < + /// Shape of one matrix production operation (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A elements, + typename ElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Data type of B elements + typename ElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Element type of C matrix + typename ElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Number of partitions along K dimension + int PartitionsK, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. + bool AccumulatorsInRowMajor> +struct DefaultMmaTensorOp +{ + +private: + // Shape for computing the FP16s + using ComputeInstructionShape = InstructionShape_; + + // Chosen so we get K=16 for int8 and K=32 for int4. + static constexpr int LoadInstructionK = 8 * sizeof_bits::value / sizeof_bits::value; + + // Shape for loading the narrow data type from shared memory + using LoadInstructionShape = GemmShape; + +public: + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma, + cutlass::MatrixShape<1, 1>>; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaTensorOpComputeBWithF16; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h b/csrc/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h new file mode 100644 index 000000000000..8e94516945eb --- /dev/null +++ b/csrc/cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h @@ -0,0 +1,301 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing warp-level matrix multiply-accumulate operations targeting + Tensor Cores. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/platform/platform.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma.h" + +#include "cutlass/gemm/warp/mma_tensor_op_policy.h" + +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace warp +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename ElementA_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename ElementB_, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename ElementC_, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Instruction shape to override shared memory iterators with + typename SharedMemoryInstructionShape_, + /// Number of partitions along K dimension + int PartitionsK_ = 1, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor = false, + /// Used for partial specialization + typename Enable = bool> +class MmaTensorOpComputeBWithF16 +{ +public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Data type of multiplicand A + using ElementA = ElementA_; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of multiplicand B + using ElementB = ElementB_; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of accumulator matrix C + using ElementC = ElementC_; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) + using Policy = Policy_; + + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename Policy::Operator; + + /// Indicates math operator + using MathOperator = typename ArchMmaOperator::Operator; + + /// Architecture tag from underlying instruction + using ArchTag = typename ArchMmaOperator::ArchTag; + static_assert((platform::is_same::value + && platform::is_same::value) + || (platform::is_same::value + && platform::is_same::value + && ArchTag::kMinComputeCapability >= 80), + "MmaTensorOpCvtBToA only supports underlying HMMA"); + + static_assert(platform::is_same::value + || (platform::is_same::value && ArchTag::kMinComputeCapability >= 80), + "MmaTensorOpCvtBToA only supports Fp16 A or Bf16 A on Ampere+"); + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; + + /// Shape of underlying instruction + using InstructionShape = typename ArchMmaOperator::Shape; + + /// Instruction shape to override shared memory iterators with + using SharedMemoryInstructionShape = SharedMemoryInstructionShape_; + + static_assert( + SharedMemoryInstructionShape::kM == InstructionShape::kM, "M dimension of compute instruction must match load"); + static_assert( + SharedMemoryInstructionShape::kN == InstructionShape::kN, "N dimension of compute instruction must match load"); + + static constexpr int kExpansionFactor = SharedMemoryInstructionShape::kK / InstructionShape::kK; + + static_assert(!(Shape::kK % SharedMemoryInstructionShape::kK), ""); + + /// Complex transform on A operand + static ComplexTransform const kTransformA = ComplexTransform::kNone; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = ComplexTransform::kNone; + + /// Number of threads participating in warp-level matrix product + static int const kThreadCount = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + +public: + /// Iterates over the A operand in memory + using IteratorA + = MmaTensorOpMultiplicandTileIterator, Operand::kA, ElementA, LayoutA, + MatrixShape, Policy::OpDelta::kRow, kThreadCount, kPartitionsK>; + + /// Storage for A tile + using FragmentA = typename IteratorA::Fragment; + + /// Storage for transformed A tile + using TransformedFragmentA = Array; + + /// Iterates over the B operand in memory + using IteratorB = MmaTensorOpMultiplicandTileIterator, Operand::kB, ElementB, + LayoutB, MatrixShape, Policy::OpDelta::kRow, + kThreadCount, kPartitionsK>; + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; + + /// Storage for transformed B tile + using TransformedFragmentB = Array; + + /// Iterates over the C operand in memory + using IteratorC = MmaTensorOpAccumulatorTileIterator, ElementC, LayoutC, + typename ArchMmaOperator::Shape, typename 
Policy::OpDelta>; + + /// Storage for C tile + using FragmentC = typename IteratorC::Fragment; + + /// Number of mma operations performed + using MmaIterations = MatrixShape<(Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM, + (Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN>; + +public: + /// Underlying matrix multiply operator (concept: arch::Mma) + ArchMmaOperator mma; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + MmaTensorOpComputeBWithF16() {} + + /// Performs a warp-level matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()(FragmentC& D, TransformedFragmentA const& A, TransformedFragmentB const& B, FragmentC const& C, + const int warp_tileB_k_offset) const + { + + using MmaOperandA = typename ArchMmaOperator::FragmentA; + using MmaOperandB = typename ArchMmaOperator::FragmentB; + using MmaOperandC = typename ArchMmaOperator::FragmentC; + + static_assert( + TransformedFragmentB::kElements == MmaOperandB::kElements * kExpansionFactor * MmaIterations::kColumn, + "Each thread should have a pack of mma registers for each column iteration AND for the expanded K dim of " + "B"); + + D = C; + + MmaOperandA const* ptr_A = reinterpret_cast(&A); + MmaOperandB const* ptr_B = reinterpret_cast(&B); + MmaOperandC* ptr_D = reinterpret_cast(&D); + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) + // Serpentine visitation order maximizing reuse of Rb + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) + { + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) + { + + int m_serpentine = ((n % 2) ? (MmaIterations::kRow - 1 - m) : m); + + int n_offsetB = warp_tileB_k_offset + kExpansionFactor * n; + if (AccumulatorsInRowMajor) + { // matrix B is reordered + mma(ptr_D[n + m_serpentine * MmaIterations::kColumn], ptr_A[m_serpentine], ptr_B[n_offsetB], + ptr_D[n + m_serpentine * MmaIterations::kColumn]); + } + else + { + mma(ptr_D[m_serpentine + n * MmaIterations::kRow], ptr_A[m_serpentine], ptr_B[n_offsetB], + ptr_D[m_serpentine + n * MmaIterations::kRow]); + } + } + } +#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + // Serpentine visitation order maximizing reuse of Ra + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) + { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) + { + + int n_serpentine = ((m % 2) ? 
(MmaIterations::kColumn - 1 - n) : n); + + int n_serpentine_offsetB = warp_tileB_k_offset + kExpansionFactor * n_serpentine; + if (AccumulatorsInRowMajor) + { // matrix B is reordered + mma(ptr_D[n_serpentine + m * MmaIterations::kColumn], ptr_A[m], ptr_B[n_serpentine_offsetB], + ptr_D[n_serpentine + m * MmaIterations::kColumn]); + } + else + { + mma(ptr_D[m + n_serpentine * MmaIterations::kRow], ptr_A[m], ptr_B[n_serpentine_offsetB], + ptr_D[m + n_serpentine * MmaIterations::kRow]); + } + } + } +#else + assert(0); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h b/csrc/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h new file mode 100644 index 000000000000..3b3fcd0f2e00 --- /dev/null +++ b/csrc/cutlass_extensions/gemm/warp/mma_tensorop_dequantizer.h @@ -0,0 +1,647 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" + +#include "cutlass/functional.h" +#include "cutlass/platform/platform.h" + +#include "cutlass_extensions/weight_only_quant_op.h" +// FIXME(woosuk) +#include + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace gemm +{ +namespace warp +{ + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Matrix multiply operator + typename MmaOperator_, + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Operand identity + Operand Operand, + /// Data type of Scale elements + typename Element_, + /// Layout of operand + typename Layout_, + /// Number of threads participating in one matrix operation + int Threads, + /// + WeightOnlyQuantOp QuantOp_, + /// + typename Enable = void> +class MmaTensorOpDequantizer; + +//////////////////////////////////////////////////////////////////////////////// +// Bfloat specialization for Ampere +template < + /// Underlying matrix multiply operator (concept: MmaTensorOp) + typename MmaOperator_, + /// Shape of the warp level matrix multiply (concept: GemmShape) + typename Shape_, + /// + WeightOnlyQuantOp QuantOp_> +class MmaTensorOpDequantizer= 80 + && platform::is_same::value>::type> +{ + +public: + /// Mma Operator + using MmaOperator = MmaOperator_; + + // The architecture specific mma ooperator being used + using ArchMmaOperator = typename MmaOperator::ArchMmaOperator; + + // Mma Instruction Shape + using InstructionShape = typename ArchMmaOperator::Shape; + + // This is the ratio of the load instruction vs the compute instruction. 
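+    // For instance (assuming the int4 path assembled by DefaultMmaTensorOp, which loads B with a
+    // K=32 instruction shape but computes with 16x8x16 MMAs), this ratio evaluates to 32 / 16 == 2:
+    // one shared-memory load of B feeds two compute instructions along K.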
+ static constexpr int kExpansionFactor = MmaOperator::IteratorB::InstructionShape::kRow / InstructionShape::kK; + + /// Type of the scales + using ElementScale = bfloat16_t; + + /// Fragment to hold B data before Mma + using FragmentDequantizedOperand = Array; + + // Fragment to hold scale data to apply to B before mma + // We need 1 fp16 per matrix iteration in the N dimension + static constexpr int kColsPerMmaPerThread = 1; + using FragmentScale = Array; + using FragmentZero = Array; + + /// Warp mma shape + using Shape = Shape_; + + /// Layout of the scales in shared memory + using Layout = layout::RowMajor; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + + CUTLASS_DEVICE + MmaTensorOpDequantizer(TensorRef smem_scales, TensorRef smem_zeros, const int warp_idx_n, const int lane_idx) + { + const int warp_offset = warp_idx_n * Shape::kN; + const int quad = lane_idx / 4; + const int thread_offset = warp_offset + quad; + pointer_scale_ = smem_scales.data() + thread_offset; + if constexpr (hasZero(QuantOp)) + { + pointer_zero_ = smem_zeros.data() + thread_offset; + } + } + + CUTLASS_DEVICE + MmaTensorOpDequantizer(TensorRef smem_scales, const int warp_idx_n, const int lane_idx) + : MmaTensorOpDequantizer(smem_scales, TensorRef(), warp_idx_n, lane_idx) + { + } + + CUTLASS_DEVICE + void load(FragmentScale& scale_frag) + { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) + { + scale_frag[mma_n_iter] = pointer_scale_[mma_n_iter * InstructionShape::kN]; + } + } + + CUTLASS_DEVICE + void dequantize(FragmentDequantizedOperand& operand_frag, const FragmentScale& scale_frag) + { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && defined(ENABLE_BF16)) + using _MmaOperandB = typename ArchMmaOperator::FragmentB; + using ExpandedMmaOperandB = Array; + static_assert(ExpandedMmaOperandB::kElements * MmaOperator::MmaIterations::kColumn + == FragmentDequantizedOperand::kElements, + ""); + + const __nv_bfloat16* scale_ptr = reinterpret_cast(&scale_frag); + ExpandedMmaOperandB* operand_frag_ptr = reinterpret_cast(&operand_frag); + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) + { + static_assert(ExpandedMmaOperandB::kElements % 2 == 0, ""); + + __nv_bfloat162 scalex2 = __bfloat162bfloat162(scale_ptr[mma_n_iter]); + __nv_bfloat162* operand_bf16x2_ptr = reinterpret_cast<__nv_bfloat162*>(&operand_frag_ptr[mma_n_iter]); + + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < ExpandedMmaOperandB::kElements / 2; ++ii) + { + operand_bf16x2_ptr[ii] = __hmul2(operand_bf16x2_ptr[ii], scalex2); + } + } +#else + // Slow path not implemented here on purpose. If we need to do HMMA on older arch, scale conversion should + // happen before scales are stored to shared memory and we should use the fp16 dequantizer. This will avoid + // numerous conversion instructions in GEMM main loop. 
+ arch::device_breakpoint(); +#endif + } + + CUTLASS_DEVICE + void load(FragmentScale& scale_frag, FragmentScale& zero_frag) + { + if constexpr (hasZero(QuantOp)) + { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) + { + scale_frag[mma_n_iter] = pointer_scale_[mma_n_iter * InstructionShape::kN]; + zero_frag[mma_n_iter] = pointer_zero_[mma_n_iter * InstructionShape::kN]; + } + } + else + { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) + { + scale_frag[mma_n_iter] = pointer_scale_[mma_n_iter * InstructionShape::kN]; + } + } + } + + CUTLASS_DEVICE + void dequantize( + FragmentDequantizedOperand& operand_frag, const FragmentScale& scale_frag, const FragmentScale& zero_frag) + { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && defined(ENABLE_BF16)) + using _MmaOperandB = typename ArchMmaOperator::FragmentB; + using ExpandedMmaOperandB = Array; + static_assert(ExpandedMmaOperandB::kElements * MmaOperator::MmaIterations::kColumn + == FragmentDequantizedOperand::kElements, + ""); + + const __nv_bfloat16* scale_ptr = reinterpret_cast(&scale_frag); + const __nv_bfloat16* zero_ptr = reinterpret_cast(&zero_frag); + + ExpandedMmaOperandB* operand_frag_ptr = reinterpret_cast(&operand_frag); + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) + { + static_assert(ExpandedMmaOperandB::kElements % 2 == 0, ""); + + __nv_bfloat162 scalex2 = __bfloat162bfloat162(scale_ptr[mma_n_iter]); + __nv_bfloat162 zerox2 = __bfloat162bfloat162(zero_ptr[mma_n_iter]); + __nv_bfloat162* operand_bf16x2_ptr = reinterpret_cast<__nv_bfloat162*>(&operand_frag_ptr[mma_n_iter]); + + if constexpr (hasZero(QuantOp)) + { + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < ExpandedMmaOperandB::kElements / 2; ++ii) + { + operand_bf16x2_ptr[ii] = __hfma2(operand_bf16x2_ptr[ii], scalex2, zerox2); + } + } + else + { + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < ExpandedMmaOperandB::kElements / 2; ++ii) + { + operand_bf16x2_ptr[ii] = __hmul2(operand_bf16x2_ptr[ii], scalex2); + } + } + } +#else + // Slow path not implemented here on purpose. If we need to do HMMA on older arch, scale conversion should + // happen before scales are stored to shared memory and we should use the fp16 dequantizer. This will avoid + // numerous conversion instructions in GEMM main loop. + arch::device_breakpoint(); +#endif + } + + // Adds a pointer offset in units of elements. 
+ CUTLASS_DEVICE + void add_pointer_offset(int64_t const& offset) + { + static_assert(sizeof(ElementScale) > 1, ""); + pointer_scale_ += offset; + pointer_zero_ += offset; + } + +private: + ElementScale const* pointer_scale_; + ElementScale const* pointer_zero_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +// Specialization for Turing & Ampere +template < + /// Underlying matrix multiply operator (concept: MmaTensorOp) + typename MmaOperator_, + /// Shape of the warp level matrix multiply (concept: GemmShape) + typename Shape_, + /// + WeightOnlyQuantOp QuantOp_> +class MmaTensorOpDequantizer= 75 + && platform::is_same::value>::type> +{ + +public: + /// Mma Operator + using MmaOperator = MmaOperator_; + + // The architecture specific mma ooperator being used + using ArchMmaOperator = typename MmaOperator::ArchMmaOperator; + + // Mma Instruction Shape + using InstructionShape = typename ArchMmaOperator::Shape; + + // This is the ratio of the load instruction vs the compute instruction. + static constexpr int kExpansionFactor = MmaOperator::IteratorB::InstructionShape::kRow / InstructionShape::kK; + + /// Type of the scales + using ElementScale = half_t; + + /// Fragment to hold B data before Mma + using FragmentDequantizedOperand = Array; + + // Fragment to hold scale data to apply to B before mma + // We need 1 fp16 per matrix iteration in the N dimension + static constexpr int kColsPerMmaPerThread = 1; + using FragmentScale = Array; + using FragmentZero = Array; + + /// Warp mma shape + using Shape = Shape_; + + /// Layout of the scales in shared memory + using Layout = layout::RowMajor; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + + CUTLASS_DEVICE + MmaTensorOpDequantizer(TensorRef smem_scales, TensorRef smem_zeros, const int warp_idx_n, const int lane_idx) + { + const int warp_offset = warp_idx_n * Shape::kN; + const int quad = lane_idx / 4; + const int thread_offset = warp_offset + quad; + pointer_scale_ = smem_scales.data() + thread_offset; + if constexpr (hasZero(QuantOp)) + { + pointer_zero_ = smem_zeros.data() + thread_offset; + } + } + + CUTLASS_DEVICE + MmaTensorOpDequantizer(TensorRef smem_scales, const int warp_idx_n, const int lane_idx) + : MmaTensorOpDequantizer(smem_scales, TensorRef(), warp_idx_n, lane_idx) + { + } + + CUTLASS_DEVICE + void load(FragmentScale& scale_frag) + { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) + { + scale_frag[mma_n_iter] = pointer_scale_[mma_n_iter * InstructionShape::kN]; + } + } + + CUTLASS_DEVICE + void dequantize(FragmentDequantizedOperand& operand_frag, const FragmentScale& scale_frag) + { + using _MmaOperandB = typename ArchMmaOperator::FragmentB; + using ExpandedMmaOperandB = Array; + static_assert(ExpandedMmaOperandB::kElements * MmaOperator::MmaIterations::kColumn + == FragmentDequantizedOperand::kElements, + ""); + + multiplies mul_op; + + ExpandedMmaOperandB* operand_frag_ptr = reinterpret_cast(&operand_frag); + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) + { + operand_frag_ptr[mma_n_iter] = mul_op(operand_frag_ptr[mma_n_iter], scale_frag[mma_n_iter]); + } + } + + CUTLASS_DEVICE + void load(FragmentScale& scale_frag, FragmentScale& zero_frag) + { + if constexpr (hasZero(QuantOp)) + { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < 
MmaOperator::MmaIterations::kColumn; ++mma_n_iter) + { + scale_frag[mma_n_iter] = pointer_scale_[mma_n_iter * InstructionShape::kN]; + zero_frag[mma_n_iter] = pointer_zero_[mma_n_iter * InstructionShape::kN]; + } + } + else + { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) + { + scale_frag[mma_n_iter] = pointer_scale_[mma_n_iter * InstructionShape::kN]; + } + } + } + + CUTLASS_DEVICE + void dequantize( + FragmentDequantizedOperand& operand_frag, const FragmentScale& scale_frag, const FragmentScale& zero_frag) + { + using _MmaOperandB = typename ArchMmaOperator::FragmentB; + using ExpandedMmaOperandB = Array; + static_assert(ExpandedMmaOperandB::kElements * MmaOperator::MmaIterations::kColumn + == FragmentDequantizedOperand::kElements, + ""); + + multiplies mul_op; + ExpandedMmaOperandB* operand_frag_ptr = reinterpret_cast(&operand_frag); + + if constexpr (hasZero(QuantOp)) + { + plus plus_op; + + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) + { + operand_frag_ptr[mma_n_iter] + = plus_op(mul_op(operand_frag_ptr[mma_n_iter], scale_frag[mma_n_iter]), zero_frag[mma_n_iter]); + } + } + else + { + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < MmaOperator::MmaIterations::kColumn; ++mma_n_iter) + { + operand_frag_ptr[mma_n_iter] = mul_op(operand_frag_ptr[mma_n_iter], scale_frag[mma_n_iter]); + } + } + } + + // Adds a pointer offset in units of elements. + CUTLASS_DEVICE + void add_pointer_offset(int64_t const& offset) + { + static_assert(sizeof(ElementScale) > 1, ""); + pointer_scale_ += offset; + pointer_zero_ += offset; + } + +private: + ElementScale const* pointer_scale_; + ElementScale const* pointer_zero_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +// Specialization for Volta A x RowMajor B tensorOp, for 32x32x4 interleaved gemm +template < + /// Underlying matrix multiply operator (concept: MmaTensorOp) + typename MmaOperator_, + /// Shape of the warp level matrix multiply (concept: GemmShape) + typename Shape_, + /// + WeightOnlyQuantOp QuantOp_> +class MmaTensorOpDequantizer::value + && platform::is_same::value>::type> +{ + +public: + static_assert(platform::is_same>::value, ""); + + /// Mma Operator + using MmaOperator = MmaOperator_; + + // The architecture specific mma ooperator being used + using ArchMmaOperator = typename MmaOperator::ArchMmaOperator; + + // Mma Instruction Shape + using InstructionShape = typename ArchMmaOperator::Shape; + + /// Type of the scales + using ElementScale = half_t; + + /// Fragment to hold B data before Mma + using FragmentDequantizedOperand = Array; + + /// Warp mma shape + using Shape = Shape_; + + // Fragment to hold scale data to apply to B before mma + // Each 32x32x4 matmul uses 8 elements from B. 
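+    // Sketch of the resulting sizes (reading from load() below, which fetches one AccessType of
+    // 8 contiguous scales per 32-column tile): a warp tile with Shape::kN == 64 would give
+    // TileNIterations == 2 and 16 scale elements held per thread.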
+ static constexpr int ColsPerMmaTile = 32; + static constexpr int TileNIterations = Shape::kN / ColsPerMmaTile; + using FragmentScale = Array; + using AccessType = Array; + + /// Layout of the scales in shared memory + using Layout = layout::RowMajor; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + static_assert(QuantOp == WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY, ""); + + CUTLASS_DEVICE + MmaTensorOpDequantizer(TensorRef smem_scales, const int warp_idx_n, const int lane_idx) + { + const int warp_offset = warp_idx_n * Shape::kN; + const int base_col = lane_idx & 0xF8; + const int thread_offset = warp_offset + base_col; + pointer_ = smem_scales.data() + thread_offset; + } + + CUTLASS_DEVICE + void load(FragmentScale& scale_frag) + { + AccessType* scale_frag_ptr = reinterpret_cast(&scale_frag); + + CUTLASS_PRAGMA_UNROLL + for (int tile_iter = 0; tile_iter < TileNIterations; ++tile_iter) + { + // We jump by 32 here since volta does <32x32x4> super mmas inside a warp. + scale_frag_ptr[tile_iter] = *reinterpret_cast(pointer_ + ColsPerMmaTile * tile_iter); + } + } + + CUTLASS_DEVICE + void dequantize(FragmentDequantizedOperand& operand_frag, const FragmentScale& scale_frag) + { + static_assert(FragmentScale::kElements == FragmentDequantizedOperand::kElements, ""); + + multiplies mul_op; + operand_frag = mul_op(operand_frag, scale_frag); + } + +private: + ElementScale const* pointer_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +// Specialization for Volta A x ColumnMajor B tensorOp, for 32x32x4 interleaved gemm +template < + /// Underlying matrix multiply operator (concept: MmaTensorOp) + typename MmaOperator_, + /// Shape of the warp level matrix multiply (concept: GemmShape) + typename Shape_, + /// + WeightOnlyQuantOp QuantOp_> +class MmaTensorOpDequantizer::value + && platform::is_same::value>::type> +{ + +public: + static_assert(platform::is_same>::value, ""); + + /// Mma Operator + using MmaOperator = MmaOperator_; + + // The architecture specific mma ooperator being used + using ArchMmaOperator = typename MmaOperator::ArchMmaOperator; + + // Mma Instruction Shape + using InstructionShape = typename ArchMmaOperator::Shape; + + /// Type of the scales + using ElementScale = half_t; + + /// Fragment to hold B data before Mma + using FragmentDequantizedOperand = Array; + + /// Warp mma shape + using Shape = Shape_; + + // Fragment to hold scale data to apply to B before mma + // Each 32x32x4 matmul uses 8 elements from B. 
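+    // In this column-major specialization each thread ends up with 2 scale values per 32-column
+    // tile, fetched 4 columns apart by load() below, so the per-thread scale fragment only needs
+    // 2 * TileNIterations elements.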
+ static constexpr int ColsPerMmaTile = 32; + static constexpr int TileNIterations = Shape::kN / ColsPerMmaTile; + using FragmentScale = Array; + + /// Layout of the scales in shared memory + using Layout = layout::RowMajor; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + static constexpr WeightOnlyQuantOp QuantOp = QuantOp_; + static_assert(QuantOp == WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY, ""); + + CUTLASS_DEVICE + MmaTensorOpDequantizer(TensorRef smem_scales, const int warp_idx_n, const int lane_idx) + { + const int warp_offset = warp_idx_n * Shape::kN; + const int base_col = lane_idx & 0xF8 + lane_idx % 4; + const int thread_offset = warp_offset + base_col; + pointer_ = smem_scales.data() + thread_offset; + } + + CUTLASS_DEVICE + void load(FragmentScale& scale_frag) + { + CUTLASS_PRAGMA_UNROLL + for (int tile_iter = 0; tile_iter < TileNIterations; ++tile_iter) + { + // We jump by 32 here since volta does <32x32x4> super mmas inside a warp. + // For col major B, each thread will jump 4 cols to get its next value inside + // of the super mma. + CUTLASS_PRAGMA_UNROLL + for (int mma_iter = 0; mma_iter < 2; ++mma_iter) + { + scale_frag[tile_iter * 2 + mma_iter] = pointer_[ColsPerMmaTile * tile_iter + 4 * mma_iter]; + } + } + } + + CUTLASS_DEVICE + void dequantize(FragmentDequantizedOperand& operand_frag, const FragmentScale& scale_frag) + { + using MmaOperandB = typename ArchMmaOperator::FragmentB; + static constexpr int total_n_mmas = 2 * TileNIterations; + static_assert(MmaOperandB::kElements * total_n_mmas == FragmentDequantizedOperand::kElements, ""); + + multiplies mul_op; + + MmaOperandB* operand_frag_ptr = reinterpret_cast(&operand_frag); + CUTLASS_PRAGMA_UNROLL + for (int mma_n_iter = 0; mma_n_iter < total_n_mmas; ++mma_n_iter) + { + operand_frag_ptr[mma_n_iter] = mul_op(operand_frag_ptr[mma_n_iter], scale_frag[mma_n_iter]); + } + } + +private: + ElementScale const* pointer_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/gemm_configs.h b/csrc/cutlass_extensions/gemm_configs.h new file mode 100644 index 000000000000..11180c4260d8 --- /dev/null +++ b/csrc/cutlass_extensions/gemm_configs.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace tensorrt_llm +{ +namespace cutlass_extensions +{ +// Note: The shapes are in the format MxNxK. The K shape of the runtime config MUST match the K shape +// in the kernel layout details when doing weight only quantization. 
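+// For example, CtaShape128x128x64_WarpShape64x64x64 describes a 128x128x64 (M x N x K) threadblock
+// tile processed by 64x64x64 warp tiles. The weight-only kernels bake ThreadblockK into the B
+// layout details, which is why a runtime config with a different K than the kernel layout is
+// invalid.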
+enum class CutlassTileConfig +{ + // Signals that we should run heuristics do choose a config + Undefined, + + // Signals that we should run heuristics do choose a config + ChooseWithHeuristic, + + // SiMT config + CtaShape128x128x8_WarpShape64x64x8, + + // TensorCore configs CTA_N = 128, CTA_K = 64 + // Warp configs for M=32 + CtaShape32x128x64_WarpShape32x32x64, + + // Warp configs for M=64 + CtaShape64x128x64_WarpShape32x64x64, + CtaShape64x64x128_WarpShape32x64x64, + CtaShape64x128x64_WarpShape64x32x64, + + // Warp configs for M=128 + CtaShape128x64x64_WarpShape64x32x64, + CtaShape128x128x64_WarpShape64x32x64, + CtaShape128x128x64_WarpShape64x64x64, + CtaShape128x128x64_WarpShape128x32x64, + CtaShape128x256x64_WarpShape64x64x64, + + // Warp configs for M=256 + CtaShape256x128x64_WarpShape64x64x64 +}; + +enum class SplitKStyle +{ + NO_SPLIT_K, + SPLIT_K_SERIAL, + // SPLIT_K_PARALLEL // Not supported yet +}; + +struct CutlassGemmConfig +{ + CutlassTileConfig tile_config = CutlassTileConfig::ChooseWithHeuristic; + SplitKStyle split_k_style = SplitKStyle::NO_SPLIT_K; + int split_k_factor = -1; + int stages = -1; +}; + +} // namespace cutlass_extensions +} // namespace tensorrt_llm diff --git a/csrc/cutlass_extensions/interleaved_numeric_conversion.h b/csrc/cutlass_extensions/interleaved_numeric_conversion.h new file mode 100644 index 000000000000..44ba79680e69 --- /dev/null +++ b/csrc/cutlass_extensions/interleaved_numeric_conversion.h @@ -0,0 +1,447 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
+ \file + \brief Boost-like numeric conversion operator for int8 and CUTLASS int4b_t interleaved in a register +*/ + +#pragma once + +#include "cutlass/arch/arch.h" +#include "cutlass/array.h" +#include "cutlass/half.h" +#include "cutlass/numeric_types.h" + +namespace cutlass +{ + +// This converter is meant to be used with data interleaved in a 32-bit register where the even elements are in the low +// bits and the odd elemeents are in the high bits of the register. In addition, it assumes elements were originally +// signed and had a bias of 2**(b-1) added (where b is the number of bits in the type) to make all numbers unsigned. +// This converter will uninterleave the data and subtract the bias while converting to the result type. +template +struct FastInterleavedAndBiasedNumericArrayConverter +{ +}; + +template <> +struct FastInterleavedAndBiasedNumericArrayConverter +{ + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) + { + result_type result; + + uint32_t* h = reinterpret_cast(&result); + uint32_t const i8s = reinterpret_cast(source); + + static constexpr uint32_t mask_for_elt_01 = 0x5250; + static constexpr uint32_t mask_for_elt_23 = 0x5351; + static constexpr uint32_t start_byte_for_fp16 = 0x64646464; + asm volatile("prmt.b32 %0,%1,%2,%3;\n" : "=r"(h[0]) : "r"(i8s), "n"(start_byte_for_fp16), "n"(mask_for_elt_01)); + asm volatile("prmt.b32 %0,%1,%2,%3;\n" : "=r"(h[1]) : "r"(i8s), "n"(start_byte_for_fp16), "n"(mask_for_elt_23)); + + // Lastly, we subtract 1152 from our constructed number using fp16 math to get our signed integer as fp16. + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(I8s_TO_F16s_MAGIC_NUM)); + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[1]) : "r"(h[1]), "r"(I8s_TO_F16s_MAGIC_NUM)); + + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) + { + return convert(s); + } +}; + +template +struct FastInterleavedAndBiasedNumericArrayConverter +{ + static constexpr int VEC_WIDTH = 4; + static_assert(!(N % VEC_WIDTH), "N must be multiple of 4."); + + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) + { + using scalar_result_type = typename result_type::Element; + using scalar_source_type = typename source_type::Element; + FastInterleavedAndBiasedNumericArrayConverter + convert_vector_; + + result_type result; + using vec_result = Array; + using vec_source = Array; + + vec_result* result_ptr = reinterpret_cast(&result); + vec_source const* source_ptr = reinterpret_cast(&source); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / VEC_WIDTH; ++i) + { + result_ptr[i] = convert_vector_(source_ptr[i]); + } + + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) + { + return convert(s); + } +}; + +template <> +struct FastInterleavedAndBiasedNumericArrayConverter +{ + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) + { + result_type result; +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + + uint32_t* bf16_result_ptr = reinterpret_cast(&result); + uint32_t const i8s = reinterpret_cast(source); + + static constexpr uint32_t fp32_base = 0x4B000000; + float fp32_intermediates[4]; + + // Construct FP32s, bfloat does not have enough mantissa for IADD trick + uint32_t* 
fp32_intermediates_casted = reinterpret_cast(fp32_intermediates); + fp32_intermediates_casted[0] = __byte_perm(i8s, fp32_base, 0x7650); + fp32_intermediates_casted[1] = __byte_perm(i8s, fp32_base, 0x7652); + fp32_intermediates_casted[2] = __byte_perm(i8s, fp32_base, 0x7651); + fp32_intermediates_casted[3] = __byte_perm(i8s, fp32_base, 0x7653); + + // Subtract out fp32_base + 128 to make the unsigned integer signed. + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < 4; ++ii) + { + fp32_intermediates[ii] -= 8388736.f; + } + + // Truncate the fp32 representation and pack up as bfloat16s. + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < 2; ++ii) + { + bf16_result_ptr[ii] + = __byte_perm(fp32_intermediates_casted[2 * ii + 0], fp32_intermediates_casted[2 * ii + 1], 0x7632); + } +#else + // Disable this on architectures older than Ampere since they lack hardware for bf16 mma. If one wishes to use + // HMMA on older hardware, they should Convert directly to FP16 using FP16 converters. + result.clear(); // Suppress compiler warning + arch::device_breakpoint(); +#endif + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) + { + return convert(s); + } +}; + +template +struct FastInterleavedAndBiasedNumericArrayConverter +{ + static constexpr int VEC_WIDTH = 4; + static_assert(!(N % VEC_WIDTH), "N must be multiple of 4."); + + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) + { + using scalar_result_type = typename result_type::Element; + using scalar_source_type = typename source_type::Element; + FastInterleavedAndBiasedNumericArrayConverter + convert_vector_; + + result_type result; + using vec_result = Array; + using vec_source = Array; + + vec_result* result_ptr = reinterpret_cast(&result); + vec_source const* source_ptr = reinterpret_cast(&source); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / VEC_WIDTH; ++i) + { + result_ptr[i] = convert_vector_(source_ptr[i]); + } + + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) + { + return convert(s); + } +}; + +template <> +struct FastInterleavedAndBiasedNumericArrayConverter +{ + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) + { + result_type result; + + uint32_t* h = reinterpret_cast(&result); + uint32_t const i4s = reinterpret_cast(source); + + // First, we extract the i4s and construct an intermediate fp16 number. + static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint32_t BOTTOM_MASK = 0x000f000f; + static constexpr uint32_t TOP_MASK = 0x00f000f0; + static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; + + // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing + // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. + // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and + // elt_67 to fp16 without having to shift them to the bottom bits before hand. + + // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue + // immediately before required. 
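+        // Worked example of the arithmetic (illustrative): a biased 4-bit value v in [0, 15] OR'd
+        // into 0x6400 produces the fp16 number 1024 + v, and subtracting 0x6408 (1032.0) recovers
+        // v - 8, the original signed int4. The high-nibble elements instead come out as 1024 + 16*v,
+        // so the fma with 1/16 (0x2c00) and -72 (0xd480, i.e. -(1024/16 + 8)) removes both the
+        // implicit factor of 16 and the bias in a single instruction.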
+ const uint32_t top_i4s = i4s >> 8; + // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[0]) + : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); + // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[1]) + : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); + // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[2]) + : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); + // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[3]) + : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); + + // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the + // half2 ctor. In this case, I chose performance reliability over code readability. + + // This is the half2 {1032, 1032} represented as an integer. + static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; + // This is the half2 {1 / 16, 1 / 16} represented as an integer. + static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; + // This is the half2 {-72, -72} represented as an integer. + static constexpr uint32_t NEG_72 = 0xd480d480; + + // Finally, we construct the output numbers. + // Convert elt_01 + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); + // Convert elt_23 + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_72)); + // Convert elt_45 + asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); + // Convert elt_67 + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_72)); + + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) + { + return convert(s); + } +}; + +template +struct FastInterleavedAndBiasedNumericArrayConverter +{ + static constexpr int VEC_WIDTH = 8; + static_assert(!(N % VEC_WIDTH), "N must be multiple of 8."); + + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) + { + using scalar_result_type = typename result_type::Element; + using scalar_source_type = typename source_type::Element; + FastInterleavedAndBiasedNumericArrayConverter + convert_vector_; + + result_type result; + using vec_result = Array; + using vec_source = Array; + + vec_result* result_ptr = reinterpret_cast(&result); + vec_source const* source_ptr = reinterpret_cast(&source); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / VEC_WIDTH; ++i) + { + result_ptr[i] = convert_vector_(source_ptr[i]); + } + + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) + { + return convert(s); + } +}; + +template <> +struct FastInterleavedAndBiasedNumericArrayConverter +{ + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) + { + result_type result; +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + + uint32_t* h = reinterpret_cast(&result); + uint32_t const source_i4s = reinterpret_cast(source); + + // First, we extract the i4s and construct an intermediate fp16 number. 
+ static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; + static constexpr uint32_t MASK = 0x000f000f; + static constexpr uint32_t I4s_TO_BF16s_MAGIC_NUM = 0x43004300; + + // We don't have enough mantissa to remove as much shift overhead as FP16, so we must loop. + // No shift needed for first item. + uint32_t i4s = source_i4s; + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[0]) + : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), "n"(immLut)); + CUTLASS_PRAGMA_UNROLL + for (int ii = 1; ii < result_type::kElements / 2; ++ii) + { + i4s >>= sizeof_bits::value; + // (i4s & 0x000f000f) | 0x43004300 + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(h[ii]) + : "r"(i4s), "n"(MASK), "n"(I4s_TO_BF16s_MAGIC_NUM), "n"(immLut)); + } + + // This is the BF16 {-136, -136} represented as an integer. + static constexpr uint32_t BF16_BIAS = 0xC308C308; + static constexpr uint32_t BF16_ONE = 0x3F803F80; + + // Finally, we construct the output numbers. + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < result_type::kElements / 2; ++ii) + { + // Since this section is for Ampere+, we use bf16 fma to do the bias subtraction + asm("fma.rn.bf16x2 %0, %1, %2, %3;\n" : "=r"(h[ii]) : "r"(h[ii]), "r"(BF16_ONE), "r"(BF16_BIAS)); + } +#else + // Disable this on architectures older than Ampere since they lack hardware for bf16 mma. If one wishes to use + // HMMA on older hardware, they should Convert directly to FP16 using FP16 converters. + arch::device_breakpoint(); + result.clear(); // Suppress compiler warning. +#endif + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) + { + return convert(s); + } +}; + +template +struct FastInterleavedAndBiasedNumericArrayConverter +{ + static constexpr int VEC_WIDTH = 8; + static_assert(!(N % VEC_WIDTH), "N must be multiple of 8."); + + using result_type = Array; + using source_type = Array; + + CUTLASS_DEVICE + static result_type convert(source_type const& source) + { + using scalar_result_type = typename result_type::Element; + using scalar_source_type = typename source_type::Element; + FastInterleavedAndBiasedNumericArrayConverter + convert_vector_; + + result_type result; + using vec_result = Array; + using vec_source = Array; + + vec_result* result_ptr = reinterpret_cast(&result); + vec_source const* source_ptr = reinterpret_cast(&source); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / VEC_WIDTH; ++i) + { + result_ptr[i] = convert_vector_(source_ptr[i]); + } + + return result; + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) + { + return convert(s); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/cutlass_extensions/tile_interleaved_layout.h b/csrc/cutlass_extensions/tile_interleaved_layout.h new file mode 100644 index 000000000000..5a0cd2957082 --- /dev/null +++ b/csrc/cutlass_extensions/tile_interleaved_layout.h @@ -0,0 +1,66 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines new layouts needed for MoE +*/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/pitch_linear_coord.h" + +namespace cutlass +{ +namespace layout +{ + +template +struct ColumnMajorTileInterleave +{ + static constexpr int kRowsPerTile = RowsPerTile; + static constexpr int kColumnsInterleaved = ColumnsInterleaved; +}; + +template +struct IsColumnMajorTileInterleave +{ + static constexpr bool value = false; +}; + +template +struct IsColumnMajorTileInterleave> +{ + static constexpr bool value = true; +}; + +} // namespace layout +} // namespace cutlass diff --git a/csrc/cutlass_extensions/transform/threadblock/fine_grained_scale_zero_iterator.h b/csrc/cutlass_extensions/transform/threadblock/fine_grained_scale_zero_iterator.h new file mode 100644 index 000000000000..f8e46f1d2ab7 --- /dev/null +++ b/csrc/cutlass_extensions/transform/threadblock/fine_grained_scale_zero_iterator.h @@ -0,0 +1,248 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates for visiting scales to be used when dequantizing the weights for weight-only GEMM + quantization. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/transform/threadblock/predicated_tile_access_iterator_params.h" + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass +{ +namespace transform +{ +namespace threadblock +{ + +//////////////////////////////////////////////////////////////////////////////// + +template +class FineGrainedScaleZeroIterator; + +template +class FineGrainedScaleZeroIterator +{ +public: + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajor; + static int const kAdvanceRank = 0; + static int const kAlignment = Alignment_; + + static int const kAccessesPerVector = 1; + + /// Row index of scales corresponding to the groupsize of 64 + int row_groupsize64_; + int group_size_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using AccessType = AlignedArray; + + // For compatibility with existing iterator interface + struct Params + { + LongIndex stride_ = 0; + + /// amount (in byte) to increment pointer from first access of current tile + /// to first access of next tile + LongIndex inc_advance_ = 0; + + // Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : stride_(layout.stride(0)) + { + inc_advance_ = Shape::kRow * stride_ * sizeof_bits::value / 8; + } + }; + +private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + +private: + // + // Data members + // + + /// Parameters object with precomputed internal state + Params const params_; + + /// Internal pointer to first access of tile + BytePointer pointer_scale_; + BytePointer pointer_zero_; + + bool is_valid_ = false; + +public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + 
CUTLASS_DEVICE
+    FineGrainedScaleZeroIterator(
+        ///< Precomputed parameters object
+        Params const& params,
+        ///< Pointer to start of scale tensor
+        Pointer pointer_scale,
+        ///< Pointer to start of zero tensor
+        Pointer pointer_zero,
+        ///< Extent of the scale and bias
+        TensorCoord extent,
+        ///< ID of each participating thread
+        int thread_id,
+        ///< Initial offset of threadblock
+        TensorCoord const& threadblock_offset,
+        ///< Group size
+        int group_size)
+        : params_(params)
+        , pointer_scale_(reinterpret_cast(const_cast(pointer_scale)))
+        , pointer_zero_(reinterpret_cast(const_cast(pointer_zero)))
+    {
+        row_groupsize64_ = threadblock_offset.row();
+        group_size_ = group_size;
+
+        const LongIndex tb_row_byte_offset
+            = threadblock_offset.row() / (group_size / 64) * params_.stride_ * sizeof_bits::value / 8;
+        const LongIndex tb_col_byte_offset = threadblock_offset.column() * sizeof_bits::value / 8;
+        pointer_scale_ += (tb_row_byte_offset + tb_col_byte_offset);
+
+        if (pointer_zero_ != nullptr)
+        {
+            pointer_zero_ += (tb_row_byte_offset + tb_col_byte_offset);
+        }
+
+        static constexpr int THREADS_PER_ROW = Shape::kColumn / kAlignment;
+
+        const int thread_row = thread_id / THREADS_PER_ROW;
+        const int thread_col = thread_id % THREADS_PER_ROW;
+
+        const LongIndex thread_row_byte_offset = thread_row * params_.stride_ * sizeof_bits::value / 8;
+        const LongIndex thread_col_byte_offset = thread_col * kAlignment * sizeof_bits::value / 8;
+        pointer_scale_ += (thread_row_byte_offset + thread_col_byte_offset);
+        if (pointer_zero_ != nullptr)
+        {
+            pointer_zero_ += (thread_row_byte_offset + thread_col_byte_offset);
+        }
+
+        // For the rows, we must check that we are within the extent AND the tile to avoid extra reads on
+        // a given iteration. The same threads will be responsible for issuing reads since the number of scales
+        // read in a given iteration is a constant. Therefore, we should never have to update is_valid_
+        // outside of the constructor.
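As an aside before the bounds check that follows, the pointer arithmetic above can be summarized by a small host-side model. This is an illustration only, not part of the patch; the shape, alignment, stride, and group-size values in the example are hypothetical, and the interpretation of the row offset (units of 64 K-rows, per the row_groupsize64_ comment above) is an assumption.

```cpp
#include <cstdint>
#include <cstdio>

// Byte offset (from the start of the scale tensor) of the first scale element a
// given thread reads, mirroring the constructor above. `stride` is the leading
// dimension of the scale tensor in elements; `element_bits` plays the role of
// sizeof_bits<Element>::value.
int64_t scale_byte_offset(int tb_row, int tb_col, int thread_id, int group_size, int64_t stride, int shape_col,
    int alignment, int element_bits)
{
    // Mirrors: threadblock_offset.row() / (group_size / 64) * stride_ * bits / 8
    const int64_t tb_row_bytes = tb_row / (group_size / 64) * stride * element_bits / 8;
    // Mirrors: threadblock_offset.column() * bits / 8
    const int64_t tb_col_bytes = int64_t(tb_col) * element_bits / 8;

    // Thread-level offset inside the threadblock tile.
    const int threads_per_row = shape_col / alignment; // Shape::kColumn / kAlignment
    const int thread_row = thread_id / threads_per_row;
    const int thread_col = thread_id % threads_per_row;
    const int64_t thread_row_bytes = thread_row * stride * element_bits / 8;
    const int64_t thread_col_bytes = int64_t(thread_col) * alignment * element_bits / 8;

    return tb_row_bytes + tb_col_bytes + thread_row_bytes + thread_col_bytes;
}

int main()
{
    // Hypothetical example: fp16 scales, 64-column tile, 8-element accesses,
    // group size 128, leading dimension 4096.
    std::printf("%lld\n",
        (long long) scale_byte_offset(/*tb_row=*/128, /*tb_col=*/64, /*thread_id=*/5, /*group_size=*/128,
            /*stride=*/4096, /*shape_col=*/64, /*alignment=*/8, /*element_bits=*/16));
    return 0;
}
```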
+ const int global_row = threadblock_offset.row() + thread_row; + const int global_col = threadblock_offset.column() + thread_col * kAlignment; + + const bool row_in_bounds = global_row < extent.row() && thread_row < Shape::kRow; + const bool col_in_bounds = global_col < extent.column(); + + is_valid_ = row_in_bounds && col_in_bounds; + } + + /// Construct a PredicatedTileAccessIterator with zero threadblock offset + CUTLASS_HOST_DEVICE FineGrainedScaleZeroIterator(Params const& params, ///< Precomputed parameters object + Pointer pointer_scale, ///< Pointer to start of scale tensor + Pointer pointer_zero, ///< Pointer to start of zero tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + int group_size) + : FineGrainedScaleZeroIterator( + params, pointer_scale, pointer_zero, extent, thread_id, make_Coord(0, 0), group_size) + { + } + + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) + { + const LongIndex row_byte_offset = tile_offset.row() * params_.inc_advance_; + const LongIndex col_byte_offset = tile_offset.column() * Shape::kColumn * sizeof_bits::value / 8; + pointer_scale_ += row_byte_offset + col_byte_offset; + if (pointer_zero_ != nullptr) + { + pointer_zero_ += row_byte_offset + col_byte_offset; + } + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE void clear_mask(bool enable = true) + { + is_valid_ &= (!enable); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() const + { + return is_valid_; + } + + /// Returns a scale pointer + CUTLASS_HOST_DEVICE + AccessType* get_scale() const + { + return reinterpret_cast(pointer_scale_); + } + + /// Returns a zero pointer + CUTLASS_HOST_DEVICE + AccessType* get_zero() const + { + return reinterpret_cast(pointer_zero_); + } +}; + +} // namespace threadblock +} // namespace transform +} // namespace cutlass diff --git a/csrc/cutlass_extensions/weight_only_quant_op.h b/csrc/cutlass_extensions/weight_only_quant_op.h new file mode 100644 index 000000000000..64774428e9f9 --- /dev/null +++ b/csrc/cutlass_extensions/weight_only_quant_op.h @@ -0,0 +1,58 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines the weight-only quantization modes (per-column scale, fine-grained scale, fine-grained scale with zeros) used by the mixed-type GEMM kernels.
+*/
+
+#pragma once
+
+namespace cutlass
+{
+
+enum class WeightOnlyQuantOp
+{
+    UNDEFINED,
+    PER_COLUMN_SCALE_ONLY,
+    FINEGRAINED_SCALE_ONLY,
+    FINEGRAINED_SCALE_AND_ZEROS
+};
+
+constexpr bool isFinegrained(WeightOnlyQuantOp op)
+{
+    return op == WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS || op == WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY;
+}
+
+constexpr bool hasZero(WeightOnlyQuantOp op)
+{
+    return op == WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS;
+}
+
+} // namespace cutlass
diff --git a/csrc/cutlass_utils/cutlass_heuristic.cpp b/csrc/cutlass_utils/cutlass_heuristic.cpp
new file mode 100644
index 000000000000..6ad1cddc6a90
--- /dev/null
+++ b/csrc/cutlass_utils/cutlass_heuristic.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "cutlass_heuristic.h" +#include + +#include "cutlass/gemm/gemm.h" +#include "cutlass/numeric_types.h" + +#include +#include + +using namespace tensorrt_llm::cutlass_extensions; + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace cutlass_kernels +{ + +struct TileShape +{ + int m; + int n; +}; + +TileShape get_cta_shape_for_config(CutlassTileConfig tile_config) +{ + switch (tile_config) + { + case CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64: return TileShape{32, 128}; + case CutlassTileConfig::CtaShape64x64x128_WarpShape32x64x64: return TileShape{64, 64}; + case CutlassTileConfig::CtaShape64x128x64_WarpShape32x64x64: + case CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64: return TileShape{64, 128}; + case CutlassTileConfig::CtaShape128x64x64_WarpShape64x32x64: return TileShape{128, 64}; + case CutlassTileConfig::CtaShape128x128x8_WarpShape64x64x8: + case CutlassTileConfig::CtaShape128x128x64_WarpShape64x32x64: + case CutlassTileConfig::CtaShape128x128x64_WarpShape64x64x64: + case CutlassTileConfig::CtaShape128x128x64_WarpShape128x32x64: return TileShape{128, 128}; + case CutlassTileConfig::CtaShape128x256x64_WarpShape64x64x64: return TileShape{128, 256}; + case CutlassTileConfig::CtaShape256x128x64_WarpShape64x64x64: return TileShape{256, 128}; + default: throw std::runtime_error("[TensorRT-LLm Error][get_grid_shape_for_config] Invalid config"); + } +} + +bool is_valid_split_k_factor(const int64_t m, const int64_t n, const int64_t k, const TileShape tile_shape, + const int split_k_factor, const size_t workspace_bytes, const bool is_weight_only) +{ + + // All tile sizes have a k_tile of 64. + static constexpr int k_tile = 64; + + // For weight-only quant, we need k and k_elements_per_split to be a multiple of cta_k + if (is_weight_only) + { + if ((k % k_tile) != 0) + { + return false; + } + + if ((k % split_k_factor) != 0) + { + return false; + } + + const int k_elements_per_split = k / split_k_factor; + if ((k_elements_per_split % k_tile) != 0) + { + return false; + } + } + + // Check that the workspace has sufficient space for this split-k factor + const int ctas_in_m_dim = (m + tile_shape.m - 1) / tile_shape.m; + const int ctas_in_n_dim = (n + tile_shape.n - 1) / tile_shape.n; + const int required_ws_bytes = split_k_factor == 1 ? 
0 : sizeof(int) * ctas_in_m_dim * ctas_in_n_dim; + + if (required_ws_bytes > workspace_bytes) + { + return false; + } + + return true; +} + +std::vector get_candidate_tiles( + const int sm, const bool is_weight_only, const bool simt_configs_only, const bool int8_configs_only) +{ + enum class CutlassGemmType : char + { + Default, + WeightOnly, + Simt, + Int8 + }; + + CutlassGemmType gemm_type = CutlassGemmType::Default; + if (simt_configs_only) + { + gemm_type = CutlassGemmType::Simt; + } + else if (is_weight_only) + { + gemm_type = CutlassGemmType::WeightOnly; + } + else if (int8_configs_only) + { + gemm_type = CutlassGemmType::Int8; + } + + std::vector base_configs{ + CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64, CutlassTileConfig::CtaShape64x128x64_WarpShape32x64x64}; + if (sm >= 75) + { + base_configs.push_back(CutlassTileConfig::CtaShape128x128x64_WarpShape64x32x64); + } + + switch (gemm_type) + { + case CutlassGemmType::Simt: return {CutlassTileConfig::CtaShape128x128x8_WarpShape64x64x8}; + case CutlassGemmType::WeightOnly: + if (sm >= 75) + { + return {CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64, + CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64, + CutlassTileConfig::CtaShape128x128x64_WarpShape128x32x64}; + } + else + { + return {CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64, + CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64}; + } + case CutlassGemmType::Int8: + return {CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64, + CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64, + CutlassTileConfig::CtaShape128x64x64_WarpShape64x32x64, + CutlassTileConfig::CtaShape64x64x128_WarpShape32x64x64, + CutlassTileConfig::CtaShape128x256x64_WarpShape64x64x64, + CutlassTileConfig::CtaShape256x128x64_WarpShape64x64x64}; + default: return base_configs; + } +} + +std::vector get_candidate_configs(int sm, const bool is_weight_only, const bool simt_configs_only, + const bool int8_configs_only, const int max_split_k) +{ + std::vector tiles + = get_candidate_tiles(sm, is_weight_only, simt_configs_only, int8_configs_only); + + std::vector candidate_configs; + const int min_stages = int8_configs_only ? 3 : 2; + const int max_stages = int8_configs_only ? 6 : (sm >= 80 ? 4 : 2); + for (const auto& tile_config : tiles) + { + for (int stages = min_stages; stages <= max_stages; ++stages) + { + CutlassGemmConfig config{tile_config, SplitKStyle::NO_SPLIT_K, 1, stages}; + candidate_configs.push_back(config); + if (sm >= 75) + { + for (int split_k_factor = 2; split_k_factor <= max_split_k; ++split_k_factor) + { + auto config = CutlassGemmConfig{tile_config, SplitKStyle::SPLIT_K_SERIAL, split_k_factor, stages}; + candidate_configs.push_back(config); + } + } + } + } + + return candidate_configs; +} + +CutlassGemmConfig estimate_best_config_from_occupancies(const std::vector& candidate_configs, + const std::vector& occupancies, const int64_t m, const int64_t n, const int64_t k, const int64_t num_experts, + const int split_k_limit, const size_t workspace_bytes, const int multi_processor_count, const int is_weight_only) +{ + + if (occupancies.size() != candidate_configs.size()) + { + throw std::runtime_error( + "[TensorRT-LLm Error][estimate_best_config_from_occupancies] occpancies and " + "candidate configs vectors must have equal length."); + } + + CutlassGemmConfig best_config; + // Score will be [0, 1]. The objective is to minimize this score. + // It represents the fraction of SM resources unused in the last wave. 
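The selection loop below scores each (tile, stage, split-k) candidate by this wave-quantization penalty. As a standalone illustration (not part of this patch; the SM count and occupancy in the example are hypothetical), the score for one candidate can be computed like so:

```cpp
// Wave-quantization score: the fraction of the last wave's CTA slots that go
// unused for a given tile shape and split-k factor. Lower is better.
#include <cstdint>
#include <cstdio>

float wave_quantization_score(
    int64_t m, int64_t n, int tile_m, int tile_n, int split_k_factor, int occupancy, int multi_processor_count)
{
    const int ctas_in_m_dim = int((m + tile_m - 1) / tile_m);
    const int ctas_in_n_dim = int((n + tile_n - 1) / tile_n);
    const int ctas_for_problem = ctas_in_m_dim * ctas_in_n_dim * split_k_factor;
    const int ctas_per_wave = occupancy * multi_processor_count;

    const int num_waves_total = (ctas_for_problem + ctas_per_wave - 1) / ctas_per_wave;
    const float num_waves_fractional = float(ctas_for_problem) / float(ctas_per_wave);
    return float(num_waves_total) - num_waves_fractional;
}

int main()
{
    // Example: a 4096x4096 problem with 128x128 tiles, no split-k, on a
    // hypothetical 108-SM GPU where the kernel reaches occupancy 1.
    std::printf("score = %f\n", wave_quantization_score(4096, 4096, 128, 128, 1, 1, 108));
    return 0;
}
```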
+ float config_score = 1.0f; + int config_waves = INT_MAX; + int current_m_tile = 0; + + const int max_split_k = n >= multi_processor_count * 256 ? 1 : split_k_limit; + for (int ii = 0; ii < candidate_configs.size(); ++ii) + { + CutlassGemmConfig candidate_config = candidate_configs[ii]; + TileShape tile_shape = get_cta_shape_for_config(candidate_config.tile_config); + int occupancy = occupancies[ii]; + + if (occupancy == 0) + { + continue; + } + + // Keep small tile sizes when possible. + if (best_config.tile_config != CutlassTileConfig::ChooseWithHeuristic && m < current_m_tile + && current_m_tile < tile_shape.m) + { + continue; + } + + const int ctas_in_m_dim = (m + tile_shape.m - 1) / tile_shape.m; + const int ctas_in_n_dim = (n + tile_shape.n - 1) / tile_shape.n; + + for (int split_k_factor = 1; split_k_factor <= max_split_k; ++split_k_factor) + { + if (is_valid_split_k_factor(m, n, k, tile_shape, split_k_factor, workspace_bytes, is_weight_only)) + { + const int ctas_per_wave = occupancy * multi_processor_count; + const int ctas_for_problem = ctas_in_m_dim * ctas_in_n_dim * split_k_factor; + + const int num_waves_total = (ctas_for_problem + ctas_per_wave - 1) / ctas_per_wave; + const float num_waves_fractional = ctas_for_problem / float(ctas_per_wave); + const float current_score = float(num_waves_total) - num_waves_fractional; + + const float score_slack = 0.1f; + if (current_score < config_score + || ((config_waves > num_waves_total) && (current_score < config_score + score_slack))) + { + config_score = current_score; + config_waves = num_waves_total; + SplitKStyle split_style + = split_k_factor > 1 ? SplitKStyle::SPLIT_K_SERIAL : SplitKStyle::NO_SPLIT_K; + best_config = CutlassGemmConfig{ + candidate_config.tile_config, split_style, split_k_factor, candidate_config.stages}; + current_m_tile = tile_shape.m; + } + else if (current_score == config_score + && (best_config.stages < candidate_config.stages || split_k_factor < best_config.split_k_factor + || current_m_tile < tile_shape.m)) + { + // Prefer deeper pipeline or smaller split-k + SplitKStyle split_style + = split_k_factor > 1 ? SplitKStyle::SPLIT_K_SERIAL : SplitKStyle::NO_SPLIT_K; + best_config = CutlassGemmConfig{ + candidate_config.tile_config, split_style, split_k_factor, candidate_config.stages}; + current_m_tile = tile_shape.m; + config_waves = num_waves_total; + } + } + } + } + + if (best_config.tile_config == CutlassTileConfig::ChooseWithHeuristic) + { + throw std::runtime_error("[TensorRT-LLm Error] Heurisitc failed to find a valid config."); + } + + return best_config; +} + +} // namespace cutlass_kernels +} // namespace kernels +} // namespace tensorrt_llm diff --git a/csrc/cutlass_utils/cutlass_heuristic.h b/csrc/cutlass_utils/cutlass_heuristic.h new file mode 100644 index 000000000000..95f50c637232 --- /dev/null +++ b/csrc/cutlass_utils/cutlass_heuristic.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "cutlass_extensions/gemm_configs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace cutlass_kernels +{ + +std::vector get_candidate_configs(int sm, + const bool is_weight_only, const bool simt_configs_only, const bool int8_configs_only = false, + const int max_split_k = 1); + +tensorrt_llm::cutlass_extensions::CutlassGemmConfig estimate_best_config_from_occupancies( + const std::vector& candidate_configs, + const std::vector& occupancies, const int64_t m, const int64_t n, const int64_t k, const int64_t num_experts, + const int split_k_limit, const size_t workspace_bytes, const int multi_processor_count, const int is_weight_only); + +} // namespace cutlass_kernels +} // namespace kernels +} // namespace tensorrt_llm diff --git a/csrc/moe/moe_gemm_kernels.h b/csrc/moe/moe_gemm_kernels.h new file mode 100644 index 000000000000..2cc69f57b8f1 --- /dev/null +++ b/csrc/moe/moe_gemm_kernels.h @@ -0,0 +1,82 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cutlass_extensions/gemm_configs.h" +#include +#include + +namespace tensorrt_llm +{ + +// Note update moe.py to match +enum class ActivationType +{ + Gelu = 0, + Relu, + Silu, + Swiglu, + Geglu, + Identity, + InvalidType +}; + +constexpr bool isGatedActivation(ActivationType activation_type) +{ + return activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu; +} + +template +class MoeGemmRunner +{ +public: + MoeGemmRunner(); + + void setBestConfig(std::optional best_config) + { + best_config_ = std::move(best_config); + } + + void moeGemmBiasAct(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + ActivationType activation_type, cudaStream_t stream); + + void moeGemm(const T* A, const WeightType* B, const T* weight_scales, T* C, int64_t* total_rows_before_expert, + int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, cudaStream_t stream); + + std::vector getConfigs(); + +private: + template + void dispatchToArch(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + cutlass_extensions::CutlassGemmConfig gemm_config, cudaStream_t stream, int* occupancy = nullptr); + + template + void runGemm(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + cudaStream_t stream); + +private: + int sm_; + int multi_processor_count_; + std::optional best_config_{}; +}; + +} // namespace 
tensorrt_llm diff --git a/csrc/moe/moe_gemm_kernels_bf16_bf16.cu b/csrc/moe/moe_gemm_kernels_bf16_bf16.cu new file mode 100644 index 000000000000..c0ca12814bbb --- /dev/null +++ b/csrc/moe/moe_gemm_kernels_bf16_bf16.cu @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "moe_gemm_kernels_template.h" + +namespace tensorrt_llm +{ +template class MoeGemmRunner<__nv_bfloat16, __nv_bfloat16>; +} // namespace tensorrt_llm diff --git a/csrc/moe/moe_gemm_kernels_fp16_fp16.cu b/csrc/moe/moe_gemm_kernels_fp16_fp16.cu new file mode 100644 index 000000000000..ea958cd6cc23 --- /dev/null +++ b/csrc/moe/moe_gemm_kernels_fp16_fp16.cu @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "moe_gemm_kernels_template.h" + +namespace tensorrt_llm +{ +template class MoeGemmRunner; +} diff --git a/csrc/moe/moe_gemm_kernels_fp32_fp32.cu b/csrc/moe/moe_gemm_kernels_fp32_fp32.cu new file mode 100644 index 000000000000..6b27ab8e9c1a --- /dev/null +++ b/csrc/moe/moe_gemm_kernels_fp32_fp32.cu @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "moe_gemm_kernels_template.h" + +namespace tensorrt_llm +{ +template class MoeGemmRunner; +} diff --git a/csrc/moe/moe_gemm_kernels_template.h b/csrc/moe/moe_gemm_kernels_template.h new file mode 100644 index 000000000000..3a94e1d0ba5d --- /dev/null +++ b/csrc/moe/moe_gemm_kernels_template.h @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "cutlass/array.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/gemm/device/gemm_grouped.h" +#include "cutlass/gemm/kernel/default_gemm_grouped.h" + +#include "cutlass_extensions/compute_occupancy.h" +#include "cutlass_extensions/epilogue_helpers.h" +#include "cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h" +#include "cutlass_extensions/gemm/kernel/moe_cutlass_kernel.h" +#include "cutlass_extensions/gemm/threadblock/default_mma.h" + +#include "../cutlass_utils/cutlass_heuristic.h" +#include "moe_gemm_kernels.h" + +#include +#include +#include +#include +#include + +// FIXME(woosuk) +#define TLLM_THROW(...) \ + do \ + { \ + throw std::runtime_error("ERROR!"); \ + } while (0) + +#define TLLM_CHECK_WITH_INFO(...) ;;\ + +namespace tensorrt_llm +{ + +// ============================= Variable batched Gemm things =========================== +template +void genericMoeGemmKernelLauncher(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + cutlass_extensions::CutlassGemmConfig gemm_config, const int multi_processor_count, cudaStream_t stream, + int* kernel_occupancy = nullptr) +{ +#ifdef ENABLE_BF16 + static_assert(cutlass::platform::is_same::value || cutlass::platform::is_same::value + || cutlass::platform::is_same::value, + "Specialized for bfloat16, half, float"); +#else + static_assert(cutlass::platform::is_same::value || cutlass::platform::is_same::value, + "Specialized for half, float"); +#endif + + static_assert(cutlass::platform::is_same::value + || cutlass::platform::is_same::value + || cutlass::platform::is_same::value, + ""); + + // The cutlass type for the input elements. This is needed to convert to cutlass::half_t if necessary. + using ElementType_ = + typename cutlass::platform::conditional::value, cutlass::half_t, T>::type; +#ifdef ENABLE_BF16 + using ElementType = + typename cutlass::platform::conditional::value, + cutlass::bfloat16_t, ElementType_>::type; +#else + using ElementType = ElementType_; +#endif + + using CutlassWeightType_ = + typename cutlass::platform::conditional::value, cutlass::half_t, + WeightType>::type; +#ifdef ENABLE_BF16 + using CutlassWeightType = + typename cutlass::platform::conditional::value, + cutlass::bfloat16_t, CutlassWeightType_>::type; +#else + using CutlassWeightType = CutlassWeightType_; +#endif + + // We need separate config for each architecture since we will target different tensorcore instructions. For float, + // we do not target TCs. + using MixedGemmArchTraits = cutlass::gemm::kernel::MixedGemmArchTraits; + using ElementAccumulator = typename MixedGemmArchTraits::AccType; + + using EpilogueOp = typename tensorrt_llm::cutlass_extensions::Epilogue::Op; + + // Finally, set up the kernel. 
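Before the kernel definition below, a note on the element-type mapping set up above: it can be reproduced in isolation with std::conditional. The sketch is illustrative only; it substitutes placeholder structs for cutlass::half_t and cutlass::bfloat16_t so that it compiles without CUTLASS, and only the selection logic matters.

```cpp
#include <cuda_fp16.h>
#ifdef ENABLE_BF16
#include <cuda_bf16.h>
#endif
#include <type_traits>

// Placeholders standing in for cutlass::half_t / cutlass::bfloat16_t.
struct cutlass_half_t {};
struct cutlass_bfloat16_t {};

// Step 1: map half -> cutlass::half_t, pass every other type through unchanged.
template <typename T>
using ElementTypeStep1 = typename std::conditional<std::is_same<T, half>::value, cutlass_half_t, T>::type;

// Step 2 (only when bf16 is enabled): map __nv_bfloat16 -> cutlass::bfloat16_t.
#ifdef ENABLE_BF16
template <typename T>
using ElementType =
    typename std::conditional<std::is_same<T, __nv_bfloat16>::value, cutlass_bfloat16_t, ElementTypeStep1<T>>::type;
#else
template <typename T>
using ElementType = ElementTypeStep1<T>;
#endif

static_assert(std::is_same<ElementType<half>, cutlass_half_t>::value, "half maps to the cutlass half type");
static_assert(std::is_same<ElementType<float>, float>::value, "float passes through unchanged");
```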
+ using GemmKernel_ = typename cutlass::gemm::kernel::DefaultGemmGrouped::GemmKernel; + + using GemmKernel = cutlass::gemm::kernel::MoeFCGemm; + + using GemmGrouped = cutlass::gemm::device::GemmGrouped; + + if (kernel_occupancy != nullptr) + { + *kernel_occupancy = tensorrt_llm::cutlass_extensions::compute_occupancy_for_kernel(); + return; + } + int occupancy = std::min(2, GemmGrouped::maximum_active_blocks()); + TLLM_CHECK_WITH_INFO(occupancy != 0, "GPU lacks the shared memory resources to run GroupedGEMM kernel"); + const int threadblock_count = multi_processor_count * occupancy; + + typename EpilogueOp::Params epilogue_op( + ElementAccumulator(1.f), biases ? ElementAccumulator(1.f) : ElementAccumulator(0.f)); + + const int group_size = gemm_k; + typename GemmGrouped::Arguments args(num_experts, threadblock_count, group_size, epilogue_op, + reinterpret_cast(A), reinterpret_cast(B), + reinterpret_cast(weight_scales), reinterpret_cast(biases), + reinterpret_cast(C), total_rows_before_expert, gemm_n, gemm_k); + + GemmGrouped gemm; + + auto can_implement = gemm.can_implement(args); + TLLM_CHECK_WITH_INFO(can_implement == cutlass::Status::kSuccess, + "MoE FC kernel will fail for params. Error: " + std::string(cutlassGetStatusString(can_implement))); + + auto init_status = gemm.initialize(args); + TLLM_CHECK_WITH_INFO(init_status == cutlass::Status::kSuccess, + "Failed to initialize cutlass variable batched gemm. Error: " + + std::string(cutlassGetStatusString(init_status))); + + auto run_status = gemm.run(stream); + TLLM_CHECK_WITH_INFO(run_status == cutlass::Status::kSuccess, + "Failed to run cutlass variable batched gemm. Error: " + std::string(cutlassGetStatusString(run_status))); +} + +template +struct dispatch_stages +{ + static void dispatch(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + cutlass_extensions::CutlassGemmConfig gemm_config, int multi_processor_count, cudaStream_t stream, + int* occupancy = nullptr) + { + TLLM_THROW("Cutlass fpA_intB gemm. 
Not instantiated for arch %d with stages set to %d", + arch::kMinComputeCapability, Stages); + } +}; + +template +struct dispatch_stages +{ + static void dispatch(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + cutlass_extensions::CutlassGemmConfig gemm_config, int multi_processor_count, cudaStream_t stream, + int* occupancy = nullptr) + { + genericMoeGemmKernelLauncher(A, B, + weight_scales, biases, C, total_rows_before_expert, num_rows, gemm_n, gemm_k, num_experts, gemm_config, + multi_processor_count, stream, occupancy); + } +}; + +template +struct dispatch_stages 2)>::type> +{ + static void dispatch(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + cutlass_extensions::CutlassGemmConfig gemm_config, int multi_processor_count, cudaStream_t stream, + int* occupancy = nullptr) + { + genericMoeGemmKernelLauncher(A, B, weight_scales, biases, C, total_rows_before_expert, num_rows, gemm_n, gemm_k, num_experts, + gemm_config, multi_processor_count, stream, occupancy); + } +}; + +template +void dispatchGemmConfig(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + cutlass_extensions::CutlassGemmConfig gemm_config, int multi_processor_count, cudaStream_t stream, + int* occupancy = nullptr) +{ + switch (gemm_config.stages) + { + case 2: + using DispatcherStages2 = dispatch_stages; + DispatcherStages2::dispatch(A, B, weight_scales, biases, C, total_rows_before_expert, num_rows, gemm_n, gemm_k, + num_experts, gemm_config, multi_processor_count, stream, occupancy); + break; + case 3: + using DispatcherStages3 = dispatch_stages; + DispatcherStages3::dispatch(A, B, weight_scales, biases, C, total_rows_before_expert, num_rows, gemm_n, gemm_k, + num_experts, gemm_config, multi_processor_count, stream, occupancy); + break; + case 4: + using DispatcherStages4 = dispatch_stages; + DispatcherStages4::dispatch(A, B, weight_scales, biases, C, total_rows_before_expert, num_rows, gemm_n, gemm_k, + num_experts, gemm_config, multi_processor_count, stream, occupancy); + break; + default: TLLM_THROW("dispatchGemmConfig does not support stages %d", gemm_config.stages); break; + } +} + +// This overload will handle tensorop gemms. It is disabled via SFINAE for fp32. +// This overload is only enabled when T == WeightType. 
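The three dispatchMoeGemmToCutlass overloads that follow are selected purely by enable_if on the (T, WeightType) pair. A minimal standalone sketch of the same selection pattern, with built-in types standing in for half and the quantized weight types, looks like this (illustrative only, not part of the patch):

```cpp
#include <cstdio>
#include <type_traits>

// Tensor-op path: non-float activations with T == WeightType.
template <typename T, typename WeightType,
    typename std::enable_if<!std::is_same<T, float>::value && std::is_same<T, WeightType>::value>::type* = nullptr>
const char* which_dispatch() { return "same-type tensorop"; }

// Tensor-op path for weight-only (mixed-type) GEMMs: non-float activations with T != WeightType.
template <typename T, typename WeightType,
    typename std::enable_if<!std::is_same<T, float>::value && !std::is_same<T, WeightType>::value>::type* = nullptr>
const char* which_dispatch() { return "mixed-type tensorop"; }

// SIMT path: float activations.
template <typename T, typename WeightType,
    typename std::enable_if<std::is_same<T, float>::value>::type* = nullptr>
const char* which_dispatch() { return "float simt"; }

int main()
{
    std::printf("%s\n", which_dispatch<short, short>());       // same-type tensorop
    std::printf("%s\n", which_dispatch<short, signed char>()); // mixed-type tensorop
    std::printf("%s\n", which_dispatch<float, float>());       // float simt
    return 0;
}
```

Exactly one overload survives substitution for any (T, WeightType) combination, which is what lets the heavier weight-only warp configurations be compiled only where they can actually be used.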
+template ::value && std::is_same::value>::type* = nullptr> +void dispatchMoeGemmToCutlass(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + cutlass_extensions::CutlassGemmConfig gemm_config, int sm_version, int multi_processor_count, cudaStream_t stream, + int* occupancy = nullptr) +{ + switch (gemm_config.tile_config) + { + case cutlass_extensions::CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64: + dispatchGemmConfig, + cutlass::gemm::GemmShape<32, 32, 64>>(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, + gemm_n, gemm_k, num_experts, gemm_config, multi_processor_count, stream, occupancy); + break; + case cutlass_extensions::CutlassTileConfig::CtaShape64x128x64_WarpShape32x64x64: + dispatchGemmConfig, + cutlass::gemm::GemmShape<32, 64, 64>>(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, + gemm_n, gemm_k, num_experts, gemm_config, multi_processor_count, stream, occupancy); + break; + case cutlass_extensions::CutlassTileConfig::CtaShape128x128x64_WarpShape64x32x64: + dispatchGemmConfig, + cutlass::gemm::GemmShape<64, 32, 64>>(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, + gemm_n, gemm_k, num_experts, gemm_config, multi_processor_count, stream, occupancy); + break; + case cutlass_extensions::CutlassTileConfig::Undefined: TLLM_THROW("GEMM config undefined."); break; + case cutlass_extensions::CutlassTileConfig::ChooseWithHeuristic: + TLLM_THROW("GEMM config should have already been set by heuristic."); + break; + default: TLLM_THROW("Config is invalid for same type tensorop GEMM."); break; + } +} + +// Tensorop GEMM overload +// Overload for quantize MoE GEMMs. 
We disable some warp configs here since they will not be used and we can improve +// compile time +template ::value && !std::is_same::value>::type* = nullptr> +void dispatchMoeGemmToCutlass(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + cutlass_extensions::CutlassGemmConfig gemm_config, int sm_version, int multi_processor_count, cudaStream_t stream, + int* occupancy = nullptr) +{ + switch (gemm_config.tile_config) + { + case cutlass_extensions::CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64: + dispatchGemmConfig, + cutlass::gemm::GemmShape<32, 32, 64>>(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, + gemm_n, gemm_k, num_experts, gemm_config, multi_processor_count, stream, occupancy); + break; + case cutlass_extensions::CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64: + dispatchGemmConfig, + cutlass::gemm::GemmShape<64, 32, 64>>(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, + gemm_n, gemm_k, num_experts, gemm_config, multi_processor_count, stream, occupancy); + break; + case cutlass_extensions::CutlassTileConfig::CtaShape128x128x64_WarpShape128x32x64: + dispatchGemmConfig, + cutlass::gemm::GemmShape<128, 32, 64>>(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, + gemm_n, gemm_k, num_experts, gemm_config, multi_processor_count, stream, occupancy); + break; + case cutlass_extensions::CutlassTileConfig::Undefined: TLLM_THROW("GEMM config undefined."); break; + case cutlass_extensions::CutlassTileConfig::ChooseWithHeuristic: + TLLM_THROW("GEMM config should have already been set by heuristic."); + break; + default: TLLM_THROW("Config is invalid for mixed type tensorop GEMM."); break; + } +} + +// This overload will handle simt gemms. It is disabled via SFINAE for tensorop. 
+template ::value>::type* = nullptr> +void dispatchMoeGemmToCutlass(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + cutlass_extensions::CutlassGemmConfig gemm_config, int sm_version, int multi_processor_count, cudaStream_t stream, + int* occupancy = nullptr) +{ + switch (gemm_config.tile_config) + { + case cutlass_extensions::CutlassTileConfig::CtaShape128x128x8_WarpShape64x64x8: + dispatchGemmConfig, + cutlass::gemm::GemmShape<64, 64, 8>>(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, + gemm_n, gemm_k, num_experts, gemm_config, multi_processor_count, stream, occupancy); + break; + case cutlass_extensions::CutlassTileConfig::Undefined: TLLM_THROW("GEMM config undefined."); break; + case cutlass_extensions::CutlassTileConfig::ChooseWithHeuristic: + TLLM_THROW("GEMM config should have already been set by heuristic."); + break; + default: TLLM_THROW("Unsupported config for float MoE gemm."); break; + } +} + +template +std::vector MoeGemmRunner::getConfigs() +{ + static constexpr bool is_weight_only = !std::is_same::value; + static constexpr bool only_simt_configs = std::is_same::value; + std::vector candidate_configs + = kernels::cutlass_kernels::get_candidate_configs(sm_, is_weight_only, only_simt_configs); + return candidate_configs; +} + +template +MoeGemmRunner::MoeGemmRunner() +{ + int device{-1}; + tensorrt_llm::common::check_cuda_error(cudaGetDevice(&device)); + sm_ = tensorrt_llm::common::getSMVersion(); + tensorrt_llm::common::check_cuda_error( + cudaDeviceGetAttribute(&multi_processor_count_, cudaDevAttrMultiProcessorCount, device)); +} + +template +template +void MoeGemmRunner::dispatchToArch(const T* A, const WeightType* B, const T* weight_scales, + const T* biases, T* C, int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, + int num_experts, cutlass_extensions::CutlassGemmConfig gemm_config, cudaStream_t stream, int* occupancy) +{ + if (sm_ >= 70 && sm_ < 75) + { + dispatchMoeGemmToCutlass(A, B, weight_scales, biases, C, + total_rows_before_expert, total_rows, gemm_n, gemm_k, num_experts, gemm_config, sm_, multi_processor_count_, + stream, occupancy); + } + else if (sm_ >= 75 && sm_ < 80) + { + dispatchMoeGemmToCutlass(A, B, weight_scales, biases, C, + total_rows_before_expert, total_rows, gemm_n, gemm_k, num_experts, gemm_config, sm_, multi_processor_count_, + stream, occupancy); + } + else if (sm_ >= 80 && sm_ < 90) + { + dispatchMoeGemmToCutlass(A, B, weight_scales, biases, C, + total_rows_before_expert, total_rows, gemm_n, gemm_k, num_experts, gemm_config, sm_, multi_processor_count_, + stream, occupancy); + } + else if (sm_ >= 90) + { + // TODO Update the arch to Sm90 once CUTLASS hopper specialisations are available + dispatchMoeGemmToCutlass(A, B, weight_scales, biases, C, + total_rows_before_expert, total_rows, gemm_n, gemm_k, num_experts, gemm_config, sm_, multi_processor_count_, + stream, occupancy); + } + else + { + TLLM_THROW("Arch unsupported for MoE GEMM"); + } +} + +template +template +void MoeGemmRunner::runGemm(const T* A, const WeightType* B, const T* weight_scales, + const T* biases, T* C, int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, + int num_experts, cudaStream_t stream) +{ + auto chosen_conf = this->best_config_; + if (!chosen_conf) + { + auto candidate_configs = getConfigs(); + std::vector 
occupancies(candidate_configs.size()); + + for (size_t ii = 0; ii < candidate_configs.size(); ++ii) + { + dispatchToArch(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, + gemm_k, num_experts, candidate_configs[ii], stream, &occupancies[ii]); + } + + static constexpr int workspace_bytes = 0; // No workspace for MoE GEMMs. + static constexpr int split_k_limit = 1; // MoE GEMM does not support split-k. + + static constexpr bool is_weight_only = !std::is_same::value; + chosen_conf = kernels::cutlass_kernels::estimate_best_config_from_occupancies(candidate_configs, occupancies, + total_rows, gemm_n, gemm_k, num_experts, split_k_limit, workspace_bytes, multi_processor_count_, + is_weight_only); + } + assert(chosen_conf); + dispatchToArch(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, + num_experts, *chosen_conf, stream); +} + +template +void MoeGemmRunner::moeGemmBiasAct(const T* A, const WeightType* B, const T* weight_scales, + const T* biases, T* C, int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, + int num_experts, ActivationType activation_type, cudaStream_t stream) +{ + switch (activation_type) + { + case ActivationType::Relu: + runGemm( + A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, num_experts, stream); + break; + case ActivationType::Gelu: + runGemm( + A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, num_experts, stream); + break; + case ActivationType::Silu: + runGemm( + A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, num_experts, stream); + break; + case ActivationType::Identity: + runGemm( + A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, num_experts, stream); + break; + case ActivationType::InvalidType: TLLM_THROW("Activation type for fpA_intB must be valid."); break; + default: TLLM_THROW("Invalid activation type."); break; + } +} + +template +void MoeGemmRunner::moeGemm(const T* A, const WeightType* B, const T* weight_scales, T* C, + int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + cudaStream_t stream) +{ + runGemm( + A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, num_experts, stream); +} + +} // namespace tensorrt_llm diff --git a/csrc/moe/moe_mlp_kernels.cu b/csrc/moe/moe_mlp_kernels.cu new file mode 100644 index 000000000000..78467781afea --- /dev/null +++ b/csrc/moe/moe_mlp_kernels.cu @@ -0,0 +1,171 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include + +#include "../dispatch_utils.h" +#include "moe_gemm_kernels.h" + +#include +#include +#include + +#include "cutlass/array.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass_extensions/epilogue/thread/fused_activations.h" + +namespace tensorrt_llm { + +// ============================== Gated Activation ================================= +template +__global__ void doGatedActivationKernel( + T* output, const T* gemm_result, const int64_t* num_valid_tokens_ptr, size_t inter_size) +{ + const int tid = threadIdx.x; + const int token = blockIdx.x; + if (num_valid_tokens_ptr && token >= *num_valid_tokens_ptr) + { + return; + } + + ActFn fn{}; + output = output + token * inter_size; + gemm_result = gemm_result + token * inter_size * 2; + for (int i = tid; i < inter_size; i += blockDim.x) + { + T fc1_value = gemm_result[i]; + // BF16 isn't supported, use FP32 for activation function + float gate_value = gemm_result[i + inter_size]; + T gate_act = fn(gate_value); + output[i] = fc1_value * gate_act; + } +} + +template +void doGatedActivation(T* output, const T* gemm_result, const int64_t* num_valid_tokens_ptr, int inter_size, + int num_tokens, ActivationType activation_type, cudaStream_t stream) +{ + const int blocks = num_tokens; + const int threads = std::min(inter_size, 1024); + + // TODO Instead of T use a vectored type if performance would benefit + // TODO For some reason Volta fails on GELU_taylor here with Warp Illegal Instruction. + auto* fn = activation_type == ActivationType::Swiglu + ? &doGatedActivationKernel> + : &doGatedActivationKernel>; + fn<<>>(output, gemm_result, num_valid_tokens_ptr, inter_size); +} + +template +void run_moe_mlp( + T* moe_output, + T* fc1_output, + T* glu_output, + const T* input_tokens, + int64_t* cum_num_tokens_per_expert, + const T* fc1_expert_weights, + const T* fc1_expert_biases, + ActivationType fc1_activation_type, + const T* fc2_expert_weights, + const int64_t num_expanded_tokens, + const int hidden_size, + const int inter_size, + const int num_experts, + cudaStream_t stream) +{ + // FIXME(woosuk): The MoE GEMM runner is created for each call. This is inefficient. 
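One way to address this FIXME, sketched below, is to cache the runner in a function-local static and reuse it across calls, optionally pinning a tuned config via getConfigs()/setBestConfig() from moe_gemm_kernels.h above. This is an illustration only, not part of the patch: it assumes a single CUDA device and that the header is on the include path; a production cache would likely key on the device id.

```cpp
#include <cuda_runtime.h>
#include "moe_gemm_kernels.h" // include path assumed

template <typename T>
tensorrt_llm::MoeGemmRunner<T, T>& get_cached_moe_gemm_runner()
{
    // Constructed once per process (and per T); the constructor queries the
    // device for its SM version and multiprocessor count.
    static tensorrt_llm::MoeGemmRunner<T, T> runner;
    return runner;
}

template <typename T>
void cached_moe_fc1(const T* input_tokens, const T* fc1_expert_weights, const T* fc1_expert_biases, T* fc1_output,
    int64_t* cum_num_tokens_per_expert, int64_t num_expanded_tokens, int inter_size, int hidden_size, int num_experts,
    tensorrt_llm::ActivationType activation_type, cudaStream_t stream)
{
    auto& runner = get_cached_moe_gemm_runner<T>();
    // Same argument order as the moeGemmBiasAct call used in run_moe_mlp below:
    // gemm_n = inter_size, gemm_k = hidden_size, no weight scales for the unquantized path.
    runner.moeGemmBiasAct(input_tokens, fc1_expert_weights, /*weight_scales=*/nullptr, fc1_expert_biases, fc1_output,
        cum_num_tokens_per_expert, num_expanded_tokens, inter_size, hidden_size, num_experts, activation_type, stream);
}
```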
+ tensorrt_llm::MoeGemmRunner moe_gemm_runner; + // Compute FC1 + if (!tensorrt_llm::isGatedActivation(fc1_activation_type)) { + moe_gemm_runner.moeGemmBiasAct( + input_tokens, fc1_expert_weights, nullptr, fc1_expert_biases, fc1_output, + cum_num_tokens_per_expert, num_expanded_tokens, inter_size, hidden_size, num_experts, + fc1_activation_type, stream); + } else { + const size_t fc1_out_size = inter_size * 2; + // Run the GEMM with activation function overridden with `Identity`, we do the activation separately + moe_gemm_runner.moeGemmBiasAct( + input_tokens, fc1_expert_weights, nullptr, fc1_expert_biases, glu_output, + cum_num_tokens_per_expert, num_expanded_tokens, fc1_out_size, hidden_size, num_experts, + ActivationType::Identity, stream); + doGatedActivation( + fc1_output, glu_output, nullptr, inter_size, num_expanded_tokens, + fc1_activation_type, stream); + } + // Compute FC2 + moe_gemm_runner.moeGemm( + fc1_output, fc2_expert_weights, nullptr, moe_output, cum_num_tokens_per_expert, + num_expanded_tokens, hidden_size, inter_size, num_experts, stream); +} + +} // namespace tensorrt_llm + +// FIXME(woosuk) +#define LAUNCH_MOE_MLP(scalar_t, nv_t) \ + tensorrt_llm::run_moe_mlp( \ + (nv_t *) moe_output.data_ptr(), \ + (nv_t *) fc1_output.data_ptr(), \ + (nv_t *) glu_output.data_ptr(), \ + (nv_t *) input_tokens.data_ptr(), \ + cum_num_tokens_per_expert.data_ptr(), \ + (nv_t *) fc1_expert_weights.data_ptr(), \ + (nv_t *) (fc1_expert_biases.has_value() ? fc1_expert_biases.value().data_ptr() : nullptr), \ + fc1_activation_type_enum, \ + (nv_t *) fc2_expert_weights.data_ptr(), \ + num_expanded_tokens, \ + hidden_size, \ + inter_size, \ + num_experts, \ + stream); + +void moe_mlp( + torch::Tensor& moe_output, // [num_tokens * topk, hidden_size] + torch::Tensor& input_tokens, // [num_tokens * topk, hidden_size] + torch::Tensor& cum_num_tokens_per_expert, // [num_experts] + torch::Tensor& fc1_expert_weights, // [num_experts, inter_size or 2 * inter_size, hidden_size] + const c10::optional& fc1_expert_biases, // [num_experts, inter_size] + int fc1_activation_type, + torch::Tensor& fc2_expert_weights) // [num_experts, hidden_size, inter_size] +{ + const int64_t num_expanded_tokens = input_tokens.numel() / input_tokens.size(-1); + const int num_experts = fc2_expert_weights.size(0); + const int hidden_size = fc2_expert_weights.size(1); + const int inter_size = fc2_expert_weights.size(2); + + const at::cuda::OptionalCUDAGuard device_guard(device_of(input_tokens)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + tensorrt_llm::ActivationType fc1_activation_type_enum = static_cast(fc1_activation_type); + torch::Tensor fc1_output = torch::empty({num_expanded_tokens, inter_size}, input_tokens.options()); + const bool is_glu = tensorrt_llm::isGatedActivation(fc1_activation_type_enum); + const int64_t glu_output_size = is_glu ? 
num_expanded_tokens * inter_size * 2 : 0; + torch::Tensor glu_output = torch::empty({glu_output_size}, input_tokens.options()); + + auto dtype = input_tokens.dtype(); + if (dtype == at::ScalarType::Float) { + LAUNCH_MOE_MLP(float, float); + } else if (dtype == at::ScalarType::Half) { + LAUNCH_MOE_MLP(at::Half, half); + } else if (dtype == at::ScalarType::BFloat16) { + LAUNCH_MOE_MLP(at::BFloat16, __nv_bfloat16); + } else { + TORCH_CHECK(false, "Unsupported data type: ", dtype); + } +} diff --git a/csrc/moe/moe_ops.cpp b/csrc/moe/moe_ops.cpp index 35c328499a22..bf1da01895f6 100644 --- a/csrc/moe/moe_ops.cpp +++ b/csrc/moe/moe_ops.cpp @@ -4,4 +4,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("topk_softmax", &topk_softmax, "Apply topk softmax to the gating outputs."); + m.def("expand_and_permute", &expand_and_permute, "Expand and permute the input tokens."); + m.def("moe_mlp", &moe_mlp, "Apply the MoE MLP."); + m.def("unpermute_and_reduce", &unpermute_and_reduce, "Unpermute and reduce the MoE outputs."); } diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index a01be3e426d7..52a346817ae9 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -7,3 +7,28 @@ void topk_softmax( torch::Tensor& topk_indices, torch::Tensor& token_expert_indices, torch::Tensor& gating_output); + +void expand_and_permute( + torch::Tensor& permuted_tokens, + torch::Tensor& cum_num_tokens_per_expert, + torch::Tensor& reverse_permutation_map, + torch::Tensor& input_tokens, + torch::Tensor& topk_indices, + torch::Tensor& token_expert_indices); + +void moe_mlp( + torch::Tensor& moe_output, + torch::Tensor& input_tokens, + torch::Tensor& cum_num_tokens_per_expert, + torch::Tensor& fc1_expert_weights, + const c10::optional& fc1_expert_biases, + int fc1_activation_type, + torch::Tensor& fc2_expert_weights); + +void unpermute_and_reduce( + torch::Tensor& output_tokens, + torch::Tensor& experts_output, + torch::Tensor& topk_weights, + torch::Tensor& topk_indices, + torch::Tensor& reverse_permutation_map, + bool renormalize); diff --git a/csrc/moe/permute_kernels.cu b/csrc/moe/permute_kernels.cu new file mode 100644 index 000000000000..3ee66c2b230c --- /dev/null +++ b/csrc/moe/permute_kernels.cu @@ -0,0 +1,243 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
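The run_moe_mlp path above amounts to two grouped GEMMs whose row segments are delimited by cum_num_tokens_per_expert. The PyTorch sketch below mirrors that flow for the non-gated, bias-free case, assuming rows are already sorted by expert and cum_num_tokens_per_expert[e] is the exclusive end offset of expert e; SiLU is used only as an example activation, and the gated path would instead run FC1 at 2 * inter_size and apply the gated activation shown earlier.

import torch
import torch.nn.functional as F

def moe_mlp_reference(tokens: torch.Tensor,          # [rows, hidden], rows grouped by expert
                      cum_rows: torch.Tensor,        # [num_experts], exclusive end offsets
                      fc1_w: torch.Tensor,           # [num_experts, inter, hidden]
                      fc2_w: torch.Tensor,           # [num_experts, hidden, inter]
                      act=F.silu) -> torch.Tensor:
    out = torch.empty_like(tokens)
    start = 0
    for e in range(fc1_w.shape[0]):
        end = int(cum_rows[e])
        if end > start:                               # expert e received some rows
            h = act(tokens[start:end] @ fc1_w[e].t()) # FC1 + activation
            out[start:end] = h @ fc2_w[e].t()         # FC2
        start = end
    return out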
+ */ +#include +#include +#include + +#include "../dispatch_utils.h" + +#include +#include +#include +#include + +#include +#include +#include + +namespace vllm { +namespace moe { + +// ========================== CUB Sorting things ==================================== +size_t get_workspace_size_for_radix_sort( + const size_t num_key_value_pairs, + const int num_buckets) +{ + size_t num_bits = (int) log2(num_buckets) + 1; + size_t required_storage = 0; + int* null_int = nullptr; + cub::DeviceRadixSort::SortPairs( + NULL, required_storage, null_int, null_int, null_int, null_int, + num_key_value_pairs, 0, num_bits); + return required_storage; +} + +void radix_sort( + const int* keys_in, + int* keys_out, + const int* values_in, + int* values_out, + void* workspace, + size_t workspace_size, + const int num_buckets, + const size_t num_key_value_pairs, + cudaStream_t stream) +{ + size_t num_bits = (int) log2(num_buckets) + 1; + cub::DeviceRadixSort::SortPairs( + workspace, workspace_size, keys_in, keys_out, values_in, values_out, + num_key_value_pairs, 0, num_bits, stream); +} + +// ============================== Infer GEMM sizes ================================= +// TODO Could linear search be better for small # experts +__device__ inline int findTotalEltsLeqTarget(const int* sorted_indices, const int arr_length, const int target) +{ + int64_t low = 0, high = arr_length - 1, target_location = -1; + while (low <= high) + { + int64_t mid = (low + high) / 2; + + if (sorted_indices[mid] > target) + { + high = mid - 1; + } + else + { + low = mid + 1; + target_location = mid; + } + } + return target_location + 1; +} + +// Sets up the gemm assuming the inputs, experts and outputs are stored in row major order. +// Assumes we want to perform output = matmul(inputs, experts) + bias +// +// "total_rows_before_expert" contains the index one past the last occurrence of the corresponding expert. +// e.g. Index 0 is the start offset of expert 1, the final entry is the total number of active rows +__global__ void computeTotalRowsBeforeExpertKernel(const int* sorted_experts, const int sorted_experts_len, + const int64_t num_experts, int64_t* total_rows_before_expert) +{ + // First, compute the global tid. We only need 1 thread per expert. + const int expert = blockIdx.x * blockDim.x + threadIdx.x; + if (expert >= num_experts) + { + return; + } + + // This should construct the last index where each expert occurs. + total_rows_before_expert[expert] = findTotalEltsLeqTarget(sorted_experts, sorted_experts_len, expert); +} + +void computeTotalRowsBeforeExpert(const int* sorted_indices, const int total_indices, const int num_experts, + int64_t* total_rows_before_expert, cudaStream_t stream) +{ + const int threads = std::min(1024, num_experts); + const int blocks = (num_experts + threads - 1) / threads; + + computeTotalRowsBeforeExpertKernel<<>>( + sorted_indices, total_indices, num_experts, total_rows_before_expert); +} + + +// ========================== Permutation things ======================================= + +// Duplicated and permutes rows for MoE. In addition, reverse the permutation map to help with finalizing routing. + +// "expanded_x_row" simply means that the number of values is num_rows x k. It is "expanded" since we will have to +// duplicate some rows in the input matrix to match the dimensions. Duplicates will always get routed to separate +// experts in the end. + +// Note that the expanded_dest_row_to_expanded_source_row map referred to here has indices in the range (0, +// k*rows_in_input - 1). 
However, it is set up so that index 0, rows_in_input, 2*rows_in_input ... (k-1)*rows_in_input +// all map to row 0 in the original matrix. Thus, to know where to read in the source matrix, we simply take the modulus +// of the expanded index. + +template +__global__ void expandInputRowsKernel(const T* unpermuted_input, T* permuted_output, + const int* expanded_dest_row_to_expanded_source_row, int* expanded_source_row_to_expanded_dest_row, + const int num_rows, const int64_t* num_dest_rows, const int cols) +{ + + // Reverse permutation map. + // I do this so that later, we can use the source -> dest map to do the k-way reduction and unpermuting. I need the + // reverse map for that reduction to allow each threadblock to do 1 k-way reduce without atomics later in MoE. 1 + // thread block will be responsible for all k summations. + const int expanded_dest_row = blockIdx.x; + const int expanded_source_row = expanded_dest_row_to_expanded_source_row[expanded_dest_row]; + if (threadIdx.x == 0) + { + expanded_source_row_to_expanded_dest_row[expanded_source_row] = expanded_dest_row; + } + + if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) + { + // Duplicate and permute rows + const int source_row = expanded_source_row % num_rows; + + const T* source_row_ptr = unpermuted_input + source_row * cols; + T* dest_row_ptr = permuted_output + expanded_dest_row * cols; + + for (int tid = threadIdx.x; tid < cols; tid += blockDim.x) + { + dest_row_ptr[tid] = source_row_ptr[tid]; + } + } +} + +template +void expandInputRowsKernelLauncher( + T* output, + int* reverse_permutation_map, + const T* input_tokens, + const int* sorted_token_expert_indices, + const int num_tokens, + const int hidden_size, + const int topk, + cudaStream_t stream) +{ + const int64_t blocks = num_tokens * topk; + const int threads = std::min(hidden_size, 1024); + expandInputRowsKernel<<>>( + input_tokens, output, sorted_token_expert_indices, reverse_permutation_map, + num_tokens, nullptr, hidden_size); +} + +} // namespace moe +} // namespace vllm + +void expand_and_permute( + torch::Tensor& permuted_tokens, // [num_tokens * topk, hidden_size] + torch::Tensor& cum_num_tokens_per_expert, // [num_experts] + torch::Tensor& reverse_permutation_map, // [num_tokens * topk] + torch::Tensor& input_tokens, // [num_tokens, hidden_size] + torch::Tensor& topk_indices, // [num_tokens, topk] + torch::Tensor& token_expert_indices) // [num_tokens, topk] +{ + const int num_experts = cum_num_tokens_per_expert.size(0); + const int topk = topk_indices.size(-1); + const int num_tokens = topk_indices.numel() / topk; + const int hidden_size = input_tokens.size(-1); + + const size_t num_expanded_tokens = num_tokens * topk; + int64_t workspace_size_bytes = (int64_t) vllm::moe::get_workspace_size_for_radix_sort( + num_expanded_tokens, num_experts); + workspace_size_bytes = (workspace_size_bytes + 15) / 16 * 16; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(input_tokens)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + torch::Tensor cub_workspace = torch::empty( + {workspace_size_bytes / input_tokens.element_size()}, input_tokens.options()); + torch::Tensor sorted_topk_indices = torch::empty_like(topk_indices); + torch::Tensor sorted_token_expert_indices = torch::empty_like(token_expert_indices); + + // Sort the token_expert_indices using topk_indices as the key + vllm::moe::radix_sort( + topk_indices.data_ptr(), + sorted_topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + sorted_token_expert_indices.data_ptr(), + 
cub_workspace.data_ptr(), + workspace_size_bytes, + num_experts, + num_expanded_tokens, + stream); + + // Compute the cumulative number of tokens per expert + vllm::moe::computeTotalRowsBeforeExpert( + sorted_topk_indices.data_ptr(), + num_expanded_tokens, + num_experts, + cum_num_tokens_per_expert.data_ptr(), + stream); + + // Expand and permute the input tokens + VLLM_DISPATCH_FLOATING_TYPES( + input_tokens.scalar_type(), "expandInputRowsKernelLauncher", + [&] { + vllm::moe::expandInputRowsKernelLauncher( + permuted_tokens.data_ptr(), + reverse_permutation_map.data_ptr(), + input_tokens.data_ptr(), + sorted_token_expert_indices.data_ptr(), + num_tokens, + hidden_size, + topk, + stream); + }); +} diff --git a/csrc/moe/unpermute_kernels.cu b/csrc/moe/unpermute_kernels.cu new file mode 100644 index 000000000000..f88f3abe198f --- /dev/null +++ b/csrc/moe/unpermute_kernels.cu @@ -0,0 +1,201 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include "../dispatch_utils.h" + +namespace vllm { +namespace moe { + +enum class MOEExpertScaleNormalizationMode : int +{ + NONE = 0, //!< Run the softmax on all scales and select the topk + RENORMALIZE, //!< Renormalize the selected scales so they sum to one. This is equivalent to only running softmax on + //!< the topk selected experts +}; + +enum class ScaleMode : int +{ + NO_SCALE = 0, + DEFAULT = 1, + RENORM_SCALE = 2, +}; + +// Final kernel to unpermute and scale +// This kernel unpermutes the original data, does the k-way reduction and performs the final skip connection. +template +__global__ void finalizeMoeRoutingKernel(const T* expanded_permuted_rows, T* reduced_unpermuted_output, const T* skip_1, + const T* skip_2, const T* bias, const float* scales, const int* expanded_source_row_to_expanded_dest_row, + const int* expert_for_source_row, const int cols, const int k, const int64_t* num_valid_ptr) +{ + const int original_row = blockIdx.x; + const int num_rows = gridDim.x; + const auto offset = original_row * cols; + T* reduced_row_ptr = reduced_unpermuted_output + offset; + const T* skip_1_row_ptr{}; + const T* skip_2_row_ptr{}; + + if (RESIDUAL_NUM >= 1) + { + skip_1_row_ptr = skip_1 + offset; + } + + if (RESIDUAL_NUM == 2) + { + skip_2_row_ptr = skip_2 + offset; + } + const int64_t num_valid = *num_valid_ptr; + for (int tid = threadIdx.x; tid < cols; tid += blockDim.x) + { + T thread_output{0.f}; + float row_rescale{0.f}; + for (int k_idx = 0; k_idx < k; ++k_idx) + { + const int expanded_original_row = original_row + k_idx * num_rows; + const int expanded_permuted_row = expanded_source_row_to_expanded_dest_row[expanded_original_row]; + + const int64_t k_offset = original_row * k + k_idx; + const float row_scale = (SCALE_MODE == ScaleMode::NO_SCALE) ? 
1.f : scales[k_offset]; + if constexpr (SCALE_MODE == ScaleMode::RENORM_SCALE) + { + row_rescale = row_rescale + row_scale; + } + + // Check after row sum has accumulated + if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) + { + continue; + } + + const T* expanded_permuted_rows_row_ptr = expanded_permuted_rows + expanded_permuted_row * cols; + + const int expert_idx = expert_for_source_row[k_offset]; + + const T* bias_ptr = bias + expert_idx * cols; + const T bias_value = HAS_BIAS ? bias_ptr[tid] : T(0.f); + + thread_output = static_cast(thread_output) + + row_scale * static_cast(expanded_permuted_rows_row_ptr[tid] + bias_value); + } + + if (SCALE_MODE == ScaleMode::RENORM_SCALE && (!CHECK_SKIPPED || thread_output)) + { + assert(row_rescale != 0.f); + thread_output = static_cast(thread_output) / row_rescale; + } + + if (RESIDUAL_NUM == 1) + { + thread_output = thread_output + skip_1_row_ptr[tid]; + } + else if (RESIDUAL_NUM == 2) + { + thread_output = thread_output + skip_1_row_ptr[tid] + skip_2_row_ptr[tid]; + } + reduced_row_ptr[tid] = thread_output; + } +} + +template +void finalizeMoeRoutingKernelLauncherSelectBias(const T* expanded_permuted_rows, T* reduced_unpermuted_output, + const T* skip_1, const T* skip_2, const T* bias, const float* scales, + const int* expanded_source_row_to_expanded_dest_row, const int* expert_for_source_row, const int num_rows, + const int cols, const int k, const int64_t* num_valid_ptr, const bool has_bias, + MOEExpertScaleNormalizationMode normalization_mode, cudaStream_t stream) +{ + const int blocks = num_rows; + const int threads = std::min(cols, 1024); + + const bool check_finished = num_valid_ptr != nullptr; + + ScaleMode renorm_scales = ScaleMode::DEFAULT; + if (normalization_mode == MOEExpertScaleNormalizationMode::RENORMALIZE) + { + renorm_scales = k == 1 ? ScaleMode::NO_SCALE : ScaleMode::RENORM_SCALE; + } + + using FuncPtr = decltype(&finalizeMoeRoutingKernel); + FuncPtr func_map[2][3][2] + = {{ + {&finalizeMoeRoutingKernel, + &finalizeMoeRoutingKernel}, + {&finalizeMoeRoutingKernel, + &finalizeMoeRoutingKernel}, + {&finalizeMoeRoutingKernel, + &finalizeMoeRoutingKernel}, + }, + { + {&finalizeMoeRoutingKernel, + &finalizeMoeRoutingKernel}, + {&finalizeMoeRoutingKernel, + &finalizeMoeRoutingKernel}, + {&finalizeMoeRoutingKernel, + &finalizeMoeRoutingKernel}, + }}; + auto* const func = func_map[check_finished][int(renorm_scales)][has_bias]; + func<<>>(expanded_permuted_rows, reduced_unpermuted_output, skip_1, skip_2, bias, + scales, expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k, num_valid_ptr); +} + +template +void finalizeMoeRoutingKernelLauncher(const T* expanded_permuted_rows, T* reduced_unpermuted_output, + const float* topk_weights, const int* expanded_source_row_to_expanded_dest_row, const int* expert_for_source_row, + const int num_rows, const int cols, const int k, bool renormalize, cudaStream_t stream) +{ + const MOEExpertScaleNormalizationMode normalization_mode = renormalize ? 
MOEExpertScaleNormalizationMode::RENORMALIZE + : MOEExpertScaleNormalizationMode::NONE; + finalizeMoeRoutingKernelLauncherSelectBias( + expanded_permuted_rows, reduced_unpermuted_output, nullptr, nullptr, nullptr, + topk_weights, expanded_source_row_to_expanded_dest_row, expert_for_source_row, + num_rows, cols, k, nullptr, false, normalization_mode, stream); +} + +} // namespace moe +} // namespace vllm + +void unpermute_and_reduce( + torch::Tensor& output_tokens, // [num_tokens, hidden_size] + torch::Tensor& experts_output, // [num_tokens * topk, hidden_size] + torch::Tensor& topk_weights, // [num_tokens, topk] + torch::Tensor& topk_indices, // [num_tokens, topk] + torch::Tensor& reverse_permutation_map, // [num_tokens * topk] + bool renormalize) +{ + const int hidden_size = output_tokens.size(-1); + const int num_tokens = output_tokens.numel() / hidden_size; + const int topk = topk_weights.size(-1); + + const at::cuda::OptionalCUDAGuard device_guard(device_of(output_tokens)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + experts_output.scalar_type(), "finalizeMoeRoutingKernelLauncher", + [&] { + vllm::moe::finalizeMoeRoutingKernelLauncher( + experts_output.data_ptr(), + output_tokens.data_ptr(), + topk_weights.data_ptr(), + reverse_permutation_map.data_ptr(), + topk_indices.data_ptr(), + num_tokens, + hidden_size, + topk, + renormalize, + stream); + }); +} diff --git a/setup.py b/setup.py index ea58a1a49e7e..dc9dc9ab8315 100644 --- a/setup.py +++ b/setup.py @@ -344,14 +344,18 @@ def get_torch_arch_list() -> Set[str]: vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") vllm_extension_sources.append("csrc/custom_all_reduce.cu") - # Add MoE kernels. + abs_root_dir = os.path.abspath(ROOT_DIR) ext_modules.append( CUDAExtension( name="vllm._moe_C", - sources=glob("csrc/moe/*.cu") + glob("csrc/moe/*.cpp"), + sources=["csrc/cutlass_utils/cutlass_heuristic.cpp", "csrc/moe/moe_ops.cpp"] + glob("csrc/moe/*.cu"), + include_dirs=[ + os.path.join(abs_root_dir, "third_party/cutlass/include/"), + os.path.join(abs_root_dir, "csrc/"), + ], extra_compile_args={ "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS, + "nvcc": NVCC_FLAGS_PUNICA + ["-DENABLE_BF16"], # FIXME }, )) diff --git a/third_party/cutlass b/third_party/cutlass new file mode 160000 index 000000000000..39c6a83f231d --- /dev/null +++ b/third_party/cutlass @@ -0,0 +1 @@ +Subproject commit 39c6a83f231d6db2bc6b9c251e7add77d68cbfb4 diff --git a/vllm/model_executor/layers/fused_moe.py b/vllm/model_executor/layers/fused_moe.py index bc3aef1887ef..4c63b836d02e 100644 --- a/vllm/model_executor/layers/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe.py @@ -218,6 +218,7 @@ def fused_moe( topk: int, renormalize: bool, inplace: bool = False, + fused_moe_config = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. @@ -248,6 +249,10 @@ def fused_moe( M, _ = hidden_states.shape E, N, _ = w1.shape + if M <= 64: + assert inplace + return fused_moe_(hidden_states, w1, w2, gating_output, topk, renormalize) + if is_hip(): # The MoE kernels are not yet supported on ROCm. 
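The permute and unpermute kernels introduced above can be validated against a few lines of PyTorch. The sketch below mirrors their bookkeeping under the stated conventions: a stable sort by expert id stands in for cub::DeviceRadixSort, torch.searchsorted reproduces computeTotalRowsBeforeExpert, row duplication uses the "expanded source row = k * num_tokens + token" layout from the expand kernel's comment, and the reduction covers only the no-bias, no-residual path of finalizeMoeRoutingKernel. It is a reference for checking the custom ops, not a drop-in replacement.

import torch

def expand_and_permute_ref(tokens, topk_ids, num_experts):
    """tokens: [T, H]; topk_ids: [T, K] chosen expert per (token, k) slot."""
    T, K = topk_ids.shape
    # Expanded source row for (token t, slot k) is k * T + t.
    source_rows = torch.arange(K).unsqueeze(0) * T + torch.arange(T).unsqueeze(1)
    sorted_experts, order = torch.sort(topk_ids.flatten(), stable=True)  # radix-sort stand-in
    dest_to_source = source_rows.flatten()[order]
    permuted = tokens[dest_to_source % T]                                # duplicate + permute
    source_to_dest = torch.empty(T * K, dtype=torch.long)
    source_to_dest[dest_to_source] = torch.arange(T * K)                 # reverse permutation map
    cum_rows = torch.searchsorted(sorted_experts,
                                  torch.arange(num_experts, dtype=sorted_experts.dtype),
                                  right=True)                            # rows before each expert
    return permuted, source_to_dest, cum_rows

def unpermute_and_reduce_ref(expert_out, topk_weights, source_to_dest, renormalize):
    """expert_out: [T*K, H]; topk_weights: [T, K] -> [T, H]."""
    T, K = topk_weights.shape
    gathered = expert_out[source_to_dest].view(K, T, -1).float()         # row k*T + t -> (k, t)
    out = (topk_weights.t().unsqueeze(-1).float() * gathered).sum(dim=0)
    if renormalize:
        out = out / topk_weights.sum(dim=-1, keepdim=True).float()
    return out.to(expert_out.dtype)

Chaining expand_and_permute_ref, the grouped-GEMM reference, and unpermute_and_reduce_ref reproduces the custom-op pipeline end to end for numerical comparison.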
routing_weights = torch.softmax(gating_output, @@ -279,21 +284,24 @@ def fused_moe( if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) - config = { - 'BLOCK_SIZE_M': 64, - 'BLOCK_SIZE_N': 64, - 'BLOCK_SIZE_K': 32, - 'GROUP_SIZE_M': 8 - } - - if topk_ids.numel() <= w1.shape[0]: + if not fused_moe_config: config = { - 'BLOCK_SIZE_M': 16, - 'BLOCK_SIZE_N': 32, - 'BLOCK_SIZE_K': 64, - 'GROUP_SIZE_M': 1 + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32, + 'GROUP_SIZE_M': 8 } + if topk_ids.numel() <= w1.shape[0]: + config = { + 'BLOCK_SIZE_M': 16, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 64, + 'GROUP_SIZE_M': 1 + } + else: + config = fused_moe_config[min(fused_moe_config.keys(), key=lambda x: abs(x - M))] + intermediate_cache1 = torch.empty((M, topk_ids.shape[1], N), device=hidden_states.device, dtype=hidden_states.dtype) @@ -325,3 +333,65 @@ def fused_moe( out=hidden_states) return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +import vllm._moe_C as moe_kernels + +def fused_moe_( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, +) -> torch.Tensor: + num_tokens = gating_output.shape[:-1].numel() + num_experts = gating_output.shape[-1] + hidden_size = hidden_states.shape[-1] + dtype = hidden_states.dtype + device = hidden_states.device + # print(hidden_states.shape, w1.shape, w2.shape, gating_output.shape) + + topk_weights = torch.empty(num_tokens, topk, dtype=torch.float32, device=device) + topk_indices = torch.empty(num_tokens, topk, dtype=torch.int32, device=device) + token_expert_indicies = torch.empty_like(topk_indices) + moe_kernels.topk_softmax( + topk_weights, + topk_indices, + token_expert_indicies, + gating_output.float(), + ) + + permuted_tokens = torch.empty(num_tokens * topk, hidden_size, dtype=dtype, device=device) + cum_num_tokens_per_expert = torch.empty(num_experts, dtype=torch.long, device=device) + reverse_permutation_map = torch.empty(num_tokens * topk, dtype=torch.int32, device=device) + moe_kernels.expand_and_permute( + permuted_tokens, + cum_num_tokens_per_expert, + reverse_permutation_map, + hidden_states, + topk_indices, + token_expert_indicies, + ) + + mlp_output = torch.empty_like(permuted_tokens) + moe_kernels.moe_mlp( + mlp_output, + permuted_tokens, + cum_num_tokens_per_expert, + w1, + None, + 3, + w2, + ) + + output_tokens = torch.empty_like(hidden_states) + moe_kernels.unpermute_and_reduce( + output_tokens, + mlp_output, + topk_weights, + topk_indices, + reverse_permutation_map, + renormalize, + ) + return output_tokens diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0100624a44d7..e122d35f7875 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -21,6 +21,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
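The new fused_moe_config argument above maps a token count M to a Triton launch config and is looked up by the nearest key. A minimal example of the expected structure and of the selection rule used in the code (the block sizes below are placeholders for illustration, not tuned values):

fused_moe_config = {
    16:   {'BLOCK_SIZE_M': 16,  'BLOCK_SIZE_N': 32,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1},
    256:  {'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8},
    4096: {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8},
}

M = 300
config = fused_moe_config[min(fused_moe_config.keys(), key=lambda x: abs(x - M))]
print(config)   # picks the entry keyed by 256, the key nearest to M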
"""Inference-only Mixtral model.""" +import os from typing import List, Optional, Tuple import torch @@ -78,6 +79,13 @@ def __init__( self.hidden_size = hidden_size self.intermediate_size = intermediate_size // self.tp_size + self.fused_moe_config = None + if "VLLM_MIXTRAL_FUSE_MOE_CONFIG" in os.environ: + import json + with open(os.environ["VLLM_MIXTRAL_FUSE_MOE_CONFIG"]) as f: + data = json.load(f) + self.fused_moe_config = {int(key): val for key, val in data.items()} + if params_dtype is None: params_dtype = torch.get_default_dtype() self.params_dtype = params_dtype @@ -133,7 +141,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_logits, self.top_k, renormalize=True, - inplace=True) + inplace=True, + fused_moe_config=self.fused_moe_config) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce(