pytorch · desertfire · Oct 14, 2025 · Oct 13, 2025
diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
@@ -172,6 +172,18 @@ int32_t aoti_torch_dtype_bfloat16() {
   return 15; // PyTorch's bfloat16 dtype code
 }
 
+int32_t aoti_torch_dtype_int8() {
+  return 1; // PyTorch's int8 dtype code (ScalarType::Char)
+}
+
+int32_t aoti_torch_dtype_int16() {
+  return 2; // PyTorch's int16 dtype code (ScalarType::Short)
+}
+
+int32_t aoti_torch_dtype_int32() {
+  return 3; // PyTorch's int32 dtype code (ScalarType::Int)
+}
+
 int32_t aoti_torch_dtype_int64() {
   return 4; // PyTorch's int64 dtype code
 }

diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
@@ -59,6 +59,9 @@ int32_t aoti_torch_device_type_cpu();
 int32_t aoti_torch_layout_strided();
 int32_t aoti_torch_dtype_float32();
 int32_t aoti_torch_dtype_bfloat16();
+int32_t aoti_torch_dtype_int8();
+int32_t aoti_torch_dtype_int16();
+int32_t aoti_torch_dtype_int32();
 int32_t aoti_torch_dtype_int64();
 
 // Autograd mode functions

diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h
@@ -34,6 +34,12 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
   // Convert based on known PyTorch dtype codes (without CUDA-specific
   // dependency)
   switch (dtype) {
+    case 1: // PyTorch's int8 dtype code
+      return executorch::aten::ScalarType::Char;
+    case 2: // PyTorch's int16 dtype code
+      return executorch::aten::ScalarType::Short;
+    case 3: // PyTorch's int32 dtype code
+      return executorch::aten::ScalarType::Int;
     case 4: // PyTorch's int64 dtype code
       return executorch::aten::ScalarType::Long;
     case 6: // PyTorch's float32 dtype code

@@ -38,7 +38,7 @@ find_package_torch()
 set(_aoti_cuda_sources
     runtime/cuda_backend.cpp runtime/shims/memory.cpp
     runtime/shims/tensor_attribute.cpp runtime/guard.cpp
-    runtime/shims/cuda_guard.cpp
+    runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu
 )
 add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
 target_include_directories(

diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
@@ -28,7 +28,9 @@
 from torch.nn.attention import SDPBackend
 
 # exist fallback operators in et namespace;
-supported_fallback_kernels: Dict[str, Any] = {}
+supported_fallback_kernels: Dict[str, Any] = {
+    "at::_ops::_weight_int4pack_mm::call": None,
+}
 
 # required fallback kernels but not supported
 missing_fallback_kernels: Set[str] = set()

diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
@@ -1,4 +1,5 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load("//tools/build/buck:nvcc_flags.bzl", "get_nvcc_arch_args")
 
 oncall("executorch")
 
@@ -7,12 +8,15 @@ runtime.cxx_library(
     srcs = [
         "guard.cpp",
         "shims/cuda_guard.cpp",
+        "shims/int4mm.cu",
         "shims/memory.cpp",
         "shims/tensor_attribute.cpp",
     ],
     headers = [
         "guard.h",
         "shims/cuda_guard.h",
+        "shims/int4mm.cuh",
+        "shims/int4mm.h",
         "shims/memory.h",
         "shims/tensor_attribute.h",
         "utils.h",
@@ -30,6 +34,10 @@ runtime.cxx_library(
         "//executorch/runtime/core/exec_aten:lib",
         "//executorch/runtime/platform:platform",
     ],
+    nvcc_flags = get_nvcc_arch_args() + [
+        "-_NVCC_HOST_COMPILER_FLAG_",
+        "gcc",
+    ],
     external_deps = [
         ("cuda", None, "cuda-lazy"),
     ],

diff --git a/backends/cuda/runtime/shims/int4mm.cu b/backends/cuda/runtime/shims/int4mm.cu
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <executorch/backends/aoti/utils.h>
+#include <executorch/backends/cuda/runtime/shims/int4mm.h>
+#include <executorch/backends/cuda/runtime/shims/int4mm.cuh>
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch::backends::cuda {
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+AOTITorchError aoti_torch_cuda__weight_int4pack_mm(
+    Tensor* self,
+    Tensor* mat2,
+    int64_t qGroupSize,
+    Tensor* qScaleAndZeros,
+    Tensor** ret0) {
+  // Null-check every pointer argument before it is dereferenced below.
+  // Only null pointers are checked here, as the actual validation of tensor
+  // properties is done in _weight_int4pack_mm_cuda.
+  ET_CHECK_OR_RETURN_ERROR(
+      self != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda__weight_int4pack_mm failed: self tensor is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      mat2 != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda__weight_int4pack_mm failed: mat2 tensor is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      qScaleAndZeros != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda__weight_int4pack_mm failed: qScaleAndZeros tensor is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      ret0 != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda__weight_int4pack_mm failed: ret0 is null");
+
+  *ret0 = _weight_int4pack_mm_cuda(*self, *mat2, qGroupSize, *qScaleAndZeros);
+  ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR();
+  return Error::Ok;
+}
+
+#ifdef __cplusplus
+}
+#endif
+} // namespace executorch::backends::cuda