
Commit eb3b816

Merge pull request #1207 from ROCm/device_abstraction
BitsandBytes Enablement on ROCm
2 parents: 701c5aa + 410f499

23 files changed: +6,028 additions, −54 deletions

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
@@ -21,3 +21,4 @@ repos:
     rev: v1.18.2
     hooks:
       - id: typos
+        exclude: ^.*\.hip$

CMakeLists.txt

Lines changed: 79 additions & 3 deletions
@@ -3,7 +3,7 @@
 # For GCC: `cmake -B build . && cmake --build build`
 # For MSVC: `cmake -B build . && cmake --build build --config Release`
 # You can also use the following options and variables
-#  - COMPUTE_BACKEND: Set to `cpu`, `cuda`, or `mps` to select the backend
+#  - COMPUTE_BACKEND: Set to `cpu`, `cuda`, `hip` or `mps` to select the backend
 #  - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support
 #  - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version
 #      is whatever CMake finds on your path.
@@ -26,13 +26,14 @@ endif()
 # Define included source files
 set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp)
 set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
+set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
 set(MPS_FILES csrc/mps_ops.mm)
 set(METAL_FILES csrc/mps_kernels.metal)
 # C++ sources are always included
 list(APPEND SRC_FILES ${CPP_FILES})

-set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, mps)")
-set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps)
+set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
+set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
 option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)

 if(APPLE)
@@ -49,16 +50,28 @@ if(${COMPUTE_BACKEND} STREQUAL "cuda")
     endif()
     option(NO_CUBLASLT "Disable CUBLAS" OFF)
     set(BUILD_CUDA ON)
+    set(BUILD_HIP OFF)
+    set(BUILD_MPS OFF)
+    message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")
+elseif(${COMPUTE_BACKEND} STREQUAL "hip")
+    if(APPLE)
+        message(FATAL_ERROR "HIP is not supported on macOS" )
+    endif()
+    option(NO_CUBLASLT "Disable HIPBLASLT" OFF)
+    set(BUILD_CUDA OFF)
+    set(BUILD_HIP ON)
     set(BUILD_MPS OFF)
     message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")
 elseif(${COMPUTE_BACKEND} STREQUAL "mps")
     if(NOT APPLE)
         message(FATAL_ERROR "MPS is only supported on macOS" )
     endif()
     set(BUILD_CUDA OFF)
+    set(BUILD_HIP OFF)
     set(BUILD_MPS ON)
 else()
     set(BUILD_CUDA OFF)
+    set(BUILD_HIP OFF)
     set(BUILD_MPS OFF)
 endif()

@@ -158,6 +171,34 @@ if(BUILD_CUDA)
         string(APPEND BNB_OUTPUT_NAME "_nocublaslt")
     endif()
     add_compile_definitions(BUILD_CUDA)
+elseif(BUILD_HIP)
+    enable_language(HIP)
+    message(STATUS "HIP Compiler: ${CMAKE_HIP_COMPILER}")
+    if(DEFINED BNB_ROCM_ARCH)
+        set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH})
+    else()
+        if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+            set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx940;gfx941;gfx942")
+        elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+            set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
+        endif()
+    endif()
+    message(STATUS "HIP Targets: ${CMAKE_HIP_ARCHITECTURES}")
+
+    list(APPEND SRC_FILES ${HIP_FILES})
+
+    string(APPEND BNB_OUTPUT_NAME "_hip")
+
+    # get hip version
+    execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION)
+    string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}")
+
+    if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1")
+        string(APPEND BNB_OUTPUT_NAME "_nohipblaslt")
+    endif()
+    add_compile_definitions(__HIP_PLATFORM_AMD__)
+    add_compile_definitions(__HIP_PLATFORM_HCC__)
+    add_compile_definitions(BUILD_HIP)
 elseif(BUILD_MPS)
     if(NOT APPLE)
         message(FATAL_ERROR "MPS is only supported on macOS" )
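Editor's note: the GPU-architecture list above is resolved with a fixed precedence: an explicit BNB_ROCM_ARCH always wins, an already-set CMAKE_HIP_ARCHITECTURES is left untouched, AMDGPU_TARGETS is used next, and the built-in CDNA list (gfx908 through gfx942) is the last resort. A minimal Python sketch of that precedence, illustrative only and not part of the PR:

    # Illustrative sketch of the CMake arch-selection precedence above.
    from typing import List, Optional

    DEFAULT_HIP_ARCHS = ["gfx908", "gfx90a", "gfx940", "gfx941", "gfx942"]

    def resolve_hip_architectures(
        bnb_rocm_arch: Optional[List[str]] = None,        # -DBNB_ROCM_ARCH=...
        cmake_hip_architectures: Optional[List[str]] = None,
        amdgpu_targets: Optional[List[str]] = None,       # -DAMDGPU_TARGETS=...
    ) -> List[str]:
        if bnb_rocm_arch:                  # explicit override always wins
            return bnb_rocm_arch
        if cmake_hip_architectures:        # already set: leave untouched
            return cmake_hip_architectures
        if amdgpu_targets:                 # fall back to AMDGPU_TARGETS
            return amdgpu_targets
        return DEFAULT_HIP_ARCHS           # built-in CDNA default list

    assert resolve_hip_architectures() == DEFAULT_HIP_ARCHS
    assert resolve_hip_architectures(bnb_rocm_arch=["gfx90a"]) == ["gfx90a"]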
@@ -213,6 +254,41 @@ if(BUILD_CUDA)
         CUDA_SEPARABLE_COMPILATION ON
     )
 endif()
+if(BUILD_HIP)
+    if(NOT DEFINED ENV{ROCM_PATH})
+        set(ROCM_PATH /opt/rocm)
+    else()
+        set(ROCM_PATH $ENV{ROCM_PATH})
+    endif()
+    list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
+    macro(find_package_and_print_version PACKAGE_NAME)
+        find_package("${PACKAGE_NAME}" ${ARGN})
+        message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
+    endmacro()
+    find_package_and_print_version(hipblas REQUIRED)
+    find_package_and_print_version(hiprand REQUIRED)
+    find_package_and_print_version(hipsparse REQUIRED)
+
+    ## hacky way of excluding hip::amdhip64 (with it linked many tests unexpectedly fail e.g. adam8bit because of inaccuracies)
+    set_target_properties(hip::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
+    set_target_properties(hip-lang::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
+    set(CMAKE_HIP_IMPLICIT_LINK_LIBRARIES "")
+
+    target_include_directories(bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/include ${ROCM_PATH}/include /include)
+    target_link_directories(bitsandbytes PRIVATE ${ROCM_PATH}/lib /lib)
+    target_link_libraries(bitsandbytes PUBLIC roc::hipblas hip::hiprand roc::hipsparse)
+
+    target_compile_definitions(bitsandbytes PUBLIC BNB_USE_HIP)
+    set_source_files_properties(${HIP_FILES} PROPERTIES LANGUAGE HIP)
+    set_target_properties(bitsandbytes PROPERTIES LINKER_LANGUAGE CXX)
+
+    if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1")
+        target_compile_definitions(bitsandbytes PUBLIC NO_HIPBLASLT)
+    else()
+        find_package(hipblaslt)
+        target_link_libraries(bitsandbytes PUBLIC roc::hipblaslt)
+    endif()
+endif()
 if(BUILD_MPS)
     add_dependencies(bitsandbytes metallib)
     target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
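Editor's note: the net effect of the BNB_OUTPUT_NAME logic is that a HIP build produces a binary with a "_hip" suffix, plus "_nohipblaslt" when hipBLASLt is unavailable (NO_CUBLASLT set, or HIP older than 6.1). A hedged Python sketch of the naming rule; the "libbitsandbytes" prefix and ".so" extension are assumptions for a Linux build, while the suffix logic mirrors the CMake above:

    # Sketch of the output-name rule; prefix/extension are Linux assumptions.
    def hip_library_name(hip_version: str, no_cublaslt: bool = False) -> str:
        major, minor = (int(p) for p in hip_version.split(".")[:2])
        name = "libbitsandbytes_hip"
        if no_cublaslt or (major, minor) < (6, 1):  # hipBLASLt needs HIP >= 6.1
            name += "_nohipblaslt"
        return name + ".so"

    assert hip_library_name("6.1") == "libbitsandbytes_hip.so"
    assert hip_library_name("6.0") == "libbitsandbytes_hip_nohipblaslt.so"
    assert hip_library_name("6.2", no_cublaslt=True) == "libbitsandbytes_hip_nohipblaslt.so"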

bitsandbytes/autograd/_functions.py

Lines changed: 3 additions & 0 deletions
@@ -7,6 +7,7 @@

 import torch

+from bitsandbytes.cextension import BNB_HIP_VERSION
 import bitsandbytes.functional as F


@@ -222,6 +223,8 @@ def supports_igemmlt(device: torch.device) -> bool:
     """check if this device supports the optimized int8 kernel"""
     if device == torch.device("cpu"):
         return True
+    if torch.version.hip:
+        return False if BNB_HIP_VERSION < 601 else True
     if torch.cuda.get_device_capability(device=device) < (7, 5):
         return False
     device_name = torch.cuda.get_device_name(device=device)
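Editor's note: on ROCm builds of PyTorch, torch.version.hip is set, and BNB_HIP_VERSION appears to encode the HIP version as major*100 + minor, so the 601 threshold corresponds to HIP 6.1 (the same release gate the CMake uses for hipBLASLt). A standalone sketch of the new gate; the encoding is inferred, not confirmed by the diff:

    # Standalone sketch of the ROCm branch added above; the major*100 + minor
    # encoding of BNB_HIP_VERSION is an inference (601 == HIP 6.1).
    def rocm_supports_igemmlt(bnb_hip_version: int) -> bool:
        return bnb_hip_version >= 601  # same as: False if bnb_hip_version < 601 else True

    assert rocm_supports_igemmlt(601)      # HIP 6.1: optimized int8 kernel available
    assert not rocm_supports_igemmlt(600)  # HIP 6.0: falls back to the non-igemmlt path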

bitsandbytes/backends/cuda.py

Lines changed: 73 additions & 24 deletions
@@ -3,7 +3,7 @@

 import torch

-from bitsandbytes.cextension import lib
+from bitsandbytes.cextension import HIP_ENVIRONMENT, lib
 from bitsandbytes.functional import (
     CUBLAS_Context,
     coo_zeros,
@@ -14,6 +14,7 @@
     get_ptr,
     get_transform_buffer,
     is_on_gpu,
+    nvidia_transform,
     post_call,
     pre_call,
     prod,
@@ -184,6 +185,11 @@ def transform(
         state: Optional[Tuple[torch.Size, str]] = None,
         ld=None,
     ):
+        if HIP_ENVIRONMENT:
+            # transform kernel formats (col32/col_turing/col_ampere) are not applicable to ROCm
+            # Use nvidia_transform instead
+            return nvidia_transform(A, to_order, from_order, out, transpose, state, ld)
+
         prev_device = pre_call(A.device)
        if state is None:
            state = (A.shape, from_order)
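Editor's note: col32, col_turing, and col_ampere are tiled layouts specific to NVIDIA tensor-core kernels, so HIP builds short-circuit to nvidia_transform, which performs plain row/col layout conversions. A usage sketch, assuming a ROCm build of bitsandbytes (where the device type is still reported as "cuda") and assuming nvidia_transform defaults from_order to "row":

    # Usage sketch for the ROCm path above (assumptions noted in the lead-in).
    import torch
    from bitsandbytes.functional import nvidia_transform

    A = torch.randint(-128, 127, (32, 64), dtype=torch.int8, device="cuda")
    out, state = nvidia_transform(A, "col")  # plain column-major, no 32-column tiling
    print(state[0], state[1])                # original shape and the new order, "col"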
@@ -266,19 +272,33 @@ def igemmlt(
             return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16)

         if dimsA == 2 and out is None:
-            out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col32", "row")
+            if HIP_ENVIRONMENT:
+                # Use col format for HIP
+                out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col", "row")
+            else:
+                out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col32", "row")
         elif dimsA == 3 and out is None:
-            out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row")
+            if HIP_ENVIRONMENT:
+                # Use col format for HIP
+                out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col", "row")
+            else:
+                out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row")

         assert dimsB != 3, "len(B.shape)==3 not supported"
         assert A.device.type == "cuda"
         assert B.device.type == "cuda"
         assert A.dtype == torch.int8
         assert B.dtype == torch.int8
         assert out.dtype == dtype
-        assert SA[1] == "col32"
-        assert SB[1] in ["col_turing", "col_ampere"]
-        assert Sout[1] == "col32"
+        if HIP_ENVIRONMENT:
+            # Use col format for HIP
+            assert SA[1] == "col"
+            assert SB[1] == "col"
+            assert Sout[1] == "col"
+        else:
+            assert SA[1] == "col32"
+            assert SB[1] in ["col_turing", "col_ampere"]
+            assert Sout[1] == "col32"
         assert (
             shapeA[-1] == shapeB[-1]
         ), f"Matmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}"
@@ -293,17 +313,23 @@ def igemmlt(
         ptrC = get_ptr(out)

         k = shapeA[-1]
-        lda = ct.c_int32(m * 32)
-        if formatB == "col_turing":
-            # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
-            # n = rows
-            ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
+        if HIP_ENVIRONMENT:
+            # Set ld values for col format
+            lda = ct.c_int32(m)
+            ldb = ct.c_int32(shapeB[0])
+            ldc = ct.c_int32(m)
         else:
-            # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
-            # n = rows
-            ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
+            lda = ct.c_int32(m * 32)
+            if formatB == "col_turing":
+                # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
+                # n = rows
+                ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
+            else:
+                # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
+                # n = rows
+                ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)

-        ldc = ct.c_int32(m * 32)
+            ldc = ct.c_int32(m * 32)
         m = ct.c_int32(m)
         n = ct.c_int32(n)
         k = ct.c_int32(k)
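Editor's note: the leading-dimension arithmetic differs because the plain col layout is ordinary column-major storage (ld is simply the row count), whereas col32 packs 32-column tiles, and the Turing/Ampere layouts for B additionally pad rows to multiples of 8 or 32. A small worked example of the formulas above, with illustrative values:

    # Worked example of the leading-dimension formulas above.
    m, rows = 16, 48        # m: output rows; rows: rows of B (n = rows)
    shapeB0 = rows

    # HIP / plain "col" layout: ld is simply the row count.
    lda_hip, ldb_hip, ldc_hip = m, shapeB0, m                  # 16, 48, 16

    # CUDA "col32" layout: 32-column tiles, so ld scales by 32.
    lda_cuda = m * 32                                          # 512
    ldb_turing = ((rows + 7) // 8) * 8 * 32                    # rows padded to 8  -> 1536
    ldb_ampere = ((rows + 31) // 32) * 32 * 32                 # rows padded to 32 -> 2048
    ldc_cuda = m * 32                                          # 512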
@@ -312,7 +338,7 @@ def igemmlt(
         ptrRowScale = get_ptr(None)
         is_on_gpu([A, B, out])

-        if formatB == "col_turing":
+        if formatB == "col_turing" or HIP_ENVIRONMENT:
             if dtype == torch.int32:
                 has_error = lib.cigemmlt_turing_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
             else:
@@ -324,7 +350,7 @@ def igemmlt(
             else:
                 has_error = lib.cigemmlt_ampere_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)

-        if has_error == 100:  # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
+        if has_error == 100:  # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`, `ops.hip`
             raise NotImplementedError("igemmlt not available (probably built with NO_CUBLASLT)")

         if has_error:
@@ -348,6 +374,9 @@ def mm_dequant(
         new_col_stats: Optional[torch.Tensor] = None,
         bias: Optional[torch.Tensor] = None,
     ):
+        if HIP_ENVIRONMENT:
+            # HIP kernel requires 'row' format
+            A, quant_state = nvidia_transform(A, "row", state=quant_state)
         assert A.dtype == torch.int32
         if bias is not None:
             assert bias.dtype == torch.float16
@@ -386,7 +415,11 @@ def extract_outliers(self, A: torch.Tensor, SA: Tuple[torch.Size, str], idx: torch.Tensor):
         shapeA = SA[0]
         formatA = SA[1]
-        assert formatA in ["col_turing", "col_ampere"]
+        if not HIP_ENVIRONMENT:
+            assert formatA in ["col_turing", "col_ampere"]
+        else:
+            # HIP uses col format
+            assert formatA in ["col"]
         assert A.device.type == "cuda"

         out = torch.zeros((shapeA[0], idx.numel()), dtype=torch.int8, device=A.device)
@@ -400,7 +433,7 @@ def extract_outliers(self, A: torch.Tensor, SA: Tuple[torch.Size, str], idx: torch.Tensor):

         prev_device = pre_call(A.device)

-        if formatA == "col_turing":
+        if formatA == "col_turing" or HIP_ENVIRONMENT:
             lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
         elif formatA == "col_ampere":
             lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
@@ -414,11 +447,15 @@ def quantize_4bit(
         A: torch.Tensor,
         absmax: Optional[torch.Tensor] = None,
         out: Optional[torch.Tensor] = None,
-        blocksize=64,
+        blocksize: Optional[int] = None,
         compress_statistics=False,
         quant_type: Literal["fp4", "nf4"] = "fp4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
+        if blocksize is None:
+            # Some AMD GPUs have warpsize 64
+            # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
+            blocksize = 64 if not HIP_ENVIRONMENT else 128
         if A.device.type != "cuda":
             raise NotImplementedError(f"Device type not supported for FP4 quantization: {A.device.type}")
         if quant_type not in ["fp4", "nf4"]:
@@ -436,7 +473,12 @@ def quantize_4bit(
         mod = dtype2bytes[quant_storage] * 2
         out = torch.zeros(((n + 1) // mod, 1), dtype=quant_storage, device=A.device)

-        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+        # Some AMD GPUs have warpsize 64
+        # Set min blocksize to 128 (~warpsize 64 in kernel) for HIP
+        if not HIP_ENVIRONMENT:
+            assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+        else:
+            assert blocksize in [4096, 2048, 1024, 512, 256, 128]

         prev_device = pre_call(A.device)
         is_on_gpu([A, out, absmax])
@@ -507,12 +549,19 @@ def dequantize_4bit(
         quant_state: Optional[QuantState] = None,
         absmax: Optional[torch.Tensor] = None,
         out: Optional[torch.Tensor] = None,
-        blocksize: int = 64,
+        blocksize: Optional[int] = None,
         quant_type: Literal["fp4", "nf4"] = "fp4",
     ) -> torch.Tensor:
-        if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
+        # Some AMD GPUs have warpsize 64
+        # Set default blocksize to 128 (~warpsize 64 in kernel) for HIP
+        if blocksize is None:
+            blocksize = 64 if not HIP_ENVIRONMENT else 128
+        supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
+        if HIP_ENVIRONMENT:
+            supported_blocksizes = supported_blocksizes[:-1]
+        if blocksize not in supported_blocksizes:
             raise ValueError(
-                f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]"
+                f"The blockwise of {blocksize} is not supported. Supported values: {supported_blocksizes}"
             )

         if quant_type not in ["fp4", "nf4"]:
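Editor's note: both 4-bit entry points now derive their default blocksize from the backend: 64 on CUDA, 128 on ROCm, since some AMD GPUs use a wavefront (warp) size of 64 and the HIP kernels therefore need at least 128 elements per block; 64 is likewise dropped from the supported list on ROCm. A standalone sketch of the rule, illustrative only:

    # Standalone sketch of the backend-dependent blocksize rules above.
    def default_4bit_blocksize(hip_environment: bool) -> int:
        # Wavefront size is 64 on some AMD GPUs, so HIP kernels need >= 128.
        return 128 if hip_environment else 64

    def supported_4bit_blocksizes(hip_environment: bool) -> list:
        sizes = [4096, 2048, 1024, 512, 256, 128, 64]
        return sizes[:-1] if hip_environment else sizes  # drop 64 on ROCm

    assert default_4bit_blocksize(False) == 64
    assert default_4bit_blocksize(True) == 128
    assert 64 not in supported_4bit_blocksizes(True)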
