From adb52c8417f543556394e714bc5a5ca34f394ac5 Mon Sep 17 00:00:00 2001
From: Axel Huebl
Date: Tue, 22 Jul 2025 17:48:57 -0700
Subject: [PATCH 01/16] Vibe Start

---
 src/Base/Array4.H | 111 ++++++++++++++++++++++++++++++++++++++++++++--
 src/Base/dlpack.h |  66 +++++++++++++++++++++++++++
 2 files changed, 174 insertions(+), 3 deletions(-)
 create mode 100644 src/Base/dlpack.h

diff --git a/src/Base/Array4.H b/src/Base/Array4.H
index a0ede4cb..96cb84f9 100644
--- a/src/Base/Array4.H
+++ b/src/Base/Array4.H
@@ -18,6 +18,15 @@
 #include <sstream>
 #include <type_traits>
 #include <vector>
+#include "dlpack.h"
+
+// GPU backend headers for device detection
+#ifdef AMREX_USE_CUDA
+#include <cuda_runtime.h>
+#endif
+#ifdef AMREX_USE_HIP
+#include <hip/hip_runtime.h>
+#endif
 
 
 namespace
@@ -222,13 +231,109 @@ namespace pyAMReX
         })
 
-        // TODO: __dlpack__ __dlpack_device__
         // DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
         // https://dmlc.github.io/dlpack/latest/
-        // https://data-apis.org/array-api/latest/design_topics/data_interchange.html
-        // https://github.com/data-apis/consortium-feedback/issues/1
         // https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
         // https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
+        .def("__dlpack__", [](Array4<T> const &a4, py::handle stream = py::none()) {
+            // Allocate shape/strides arrays
+            constexpr int ndim = 4;
+            auto const len = length(a4);
+            auto *shape = new int64_t[ndim]{a4.nComp(), len.z, len.y, len.x};
+            auto *strides = new int64_t[ndim]{a4.nstride, a4.kstride, a4.jstride, 1};
+            // DLPack dtype
+            DLDataType dtype{};
+            if constexpr (std::is_same_v<T, float>) { dtype.code = kDLFloat; dtype.bits = 32; dtype.lanes = 1; }
+            else if constexpr (std::is_same_v<T, double>) { dtype.code = kDLFloat; dtype.bits = 64; dtype.lanes = 1; }
+            else if constexpr (std::is_same_v<T, std::int32_t>) { dtype.code = kDLInt; dtype.bits = 32; dtype.lanes = 1; }
+            else if constexpr (std::is_same_v<T, std::int64_t>) { dtype.code = kDLInt; dtype.bits = 64; dtype.lanes = 1; }
+            else if constexpr (std::is_same_v<T, std::uint32_t>) { dtype.code = kDLUInt; dtype.bits = 32; dtype.lanes = 1; }
+            else if constexpr (std::is_same_v<T, std::uint64_t>) { dtype.code = kDLUInt; dtype.bits = 64; dtype.lanes = 1; }
+            else { throw std::runtime_error("Unsupported dtype for DLPack"); }
+
+            // Device detection based on AMReX GPU backend
+            DLDevice device{ kDLCPU, 0 };
+#ifdef AMREX_USE_CUDA
+            // Check if data is on GPU by checking if pointer is in CUDA memory
+            cudaPointerAttributes attr;
+            cudaError_t err = cudaPointerGetAttributes(&attr, a4.dataPtr());
+            if (err == cudaSuccess && attr.memoryType == cudaMemoryTypeDevice) {
+                device.device_type = kDLCUDA;
+                device.device_id = attr.device;
+            }
+#elif defined(AMREX_USE_HIP)
+            // Check if data is on GPU by checking if pointer is in HIP memory
+            hipPointerAttribute_t attr;
+            hipError_t err = hipPointerGetAttributes(&attr, a4.dataPtr());
+            if (err == hipSuccess && attr.memoryType == hipMemoryTypeDevice) {
+                device.device_type = kDLROCM;
+                device.device_id = attr.device;
+            }
+#elif defined(AMREX_USE_DPCPP)
+            // For SYCL, we need to check if the data is on device
+            // This is more complex as SYCL doesn't have a simple pointer check
+            // For now, assume CPU - SYCL support would need more sophisticated detection
+            // device.device_type = kDLExtDev;  // SYCL would use extended device type
+            // device.device_id = 0;
+#endif
+
+            // Construct DLTensor
+            auto *dl_tensor = new DLManagedTensor;
+            dl_tensor->dl_tensor.data = const_cast<void*>(static_cast<void const*>(a4.dataPtr()));
+            dl_tensor->dl_tensor.device = device;
+            dl_tensor->dl_tensor.ndim = ndim;
+            dl_tensor->dl_tensor.dtype = dtype;
+            dl_tensor->dl_tensor.shape = shape;
+            dl_tensor->dl_tensor.strides = strides;
+            dl_tensor->dl_tensor.byte_offset = 0;
+            dl_tensor->manager_ctx = nullptr;
+            dl_tensor->deleter = [](DLManagedTensor *self) {
+                delete[] self->dl_tensor.shape;
+                delete[] self->dl_tensor.strides;
+                delete self;
+            };
+            // Return as Python capsule
+            return py::capsule(dl_tensor, "dltensor", [](void* ptr) {
+                auto* tensor = static_cast<DLManagedTensor*>(ptr);
+                tensor->deleter(tensor);
+            });
+        }, py::arg("stream") = py::none(), R"doc(
+        DLPack protocol for zero-copy tensor exchange.
+        See https://dmlc.github.io/dlpack/latest/ for details.
+        )doc")
+        .def("__dlpack_device__", [](Array4<T> const &a4) {
+            // Device detection based on AMReX GPU backend
+            int device_type = kDLCPU;
+            int device_id = 0;
+
+#ifdef AMREX_USE_CUDA
+            // Check if data is on GPU by checking if pointer is in CUDA memory
+            cudaPointerAttributes attr;
+            cudaError_t err = cudaPointerGetAttributes(&attr, a4.dataPtr());
+            if (err == cudaSuccess && attr.memoryType == cudaMemoryTypeDevice) {
+                device_type = kDLCUDA;
+                device_id = attr.device;
+            }
+#elif defined(AMREX_USE_HIP)
+            // Check if data is on GPU by checking if pointer is in HIP memory
+            hipPointerAttribute_t attr;
+            hipError_t err = hipPointerGetAttributes(&attr, a4.dataPtr());
+            if (err == hipSuccess && attr.memoryType == hipMemoryTypeDevice) {
+                device_type = kDLROCM;
+                device_id = attr.device;
+            }
+#elif defined(AMREX_USE_DPCPP)
+            // For SYCL, we need to check if the data is on device
+            // This is more complex as SYCL doesn't have a simple pointer check
+            // For now, assume CPU - SYCL support would need more sophisticated detection
+            // device_type = kDLExtDev;  // SYCL would use extended device type
+            // device_id = 0;
+#endif
+
+            return std::make_tuple(device_type, device_id);
+        }, R"doc(
+        DLPack device info (device_type, device_id).
+        )doc")
 
         .def("to_host", [](Array4<T> const & a4) {
             // py::tuple to std::vector
             auto const a4i = pyAMReX::array_interface(a4);
diff --git a/src/Base/dlpack.h b/src/Base/dlpack.h
new file mode 100644
index 00000000..812b7640
--- /dev/null
+++ b/src/Base/dlpack.h
@@ -0,0 +1,66 @@
+#ifndef AMREX_DLPACK_H_
+#define AMREX_DLPACK_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+
+// Device type codes
+#define kDLCPU 1
+#define kDLCUDA 2
+#define kDLCUDAHost 3
+#define kDLOpenCL 4
+#define kDLVulkan 7
+#define kDLMetal 8
+#define kDLVPI 9
+#define kDLROCM 10
+#define kDLROCMHost 11
+#define kDLExtDev 12
+
+// Data type codes
+#define kDLInt 0
+#define kDLUInt 1
+#define kDLFloat 2
+
+// Device context
+typedef struct {
+    int32_t device_type;
+    int32_t device_id;
+} DLDevice;
+
+// Data type
+typedef struct {
+    uint8_t code;    // kDLFloat=2, kDLInt=0, kDLUInt=1
+    uint8_t bits;    // number of bits, e.g., 32, 64
+    uint16_t lanes;  // number of lanes (for vector types)
+} DLDataType;
+
+// Tensor structure
+typedef struct {
+    void* data;
+    DLDevice device;
+    int32_t ndim;
+    int64_t* shape;
+    int64_t* strides;  // in elements, not bytes; can be NULL for compact
+    uint64_t byte_offset;
+    DLDataType dtype;
+} DLTensor;
+
+// Managed tensor with deleter
+struct DLManagedTensor;
+typedef void (*DLManagedTensorDeleter)(struct DLManagedTensor* self);
+
+typedef struct DLManagedTensor {
+    DLTensor dl_tensor;
+    void* manager_ctx;
+    DLManagedTensorDeleter deleter;
+} DLManagedTensor;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AMREX_DLPACK_H_
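On the consumer side this protocol is exercised with a single call in NumPy or CuPy. A minimal sketch of the intended round trip (assuming a host-side amr.Array4_double, following the existing test suite in tests/test_array4.py):

    import numpy as np
    import amrex.space3d as amr

    x = np.ones((2, 3, 4))           # z, y, x
    arr = amr.Array4_double(x)
    print(arr.__dlpack_device__())   # e.g. (1, 0): kDLCPU, device 0
    view = np.from_dlpack(arr)       # zero-copy view via __dlpack__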
From 32e4535b5f16ff0e73deace5f68d635bdb46b86a Mon Sep 17 00:00:00 2001
From: Axel Huebl
Date: Tue, 22 Jul 2025 20:02:05 -0700
Subject: [PATCH 02/16] Cleaning

---
 src/Base/Array4.H |  91 ++-------
 src/Base/dlpack.h |  66 -------
 src/dlpack.h      | 466 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 479 insertions(+), 144 deletions(-)
 delete mode 100644 src/Base/dlpack.h
 create mode 100644 src/dlpack.h

diff --git a/src/Base/Array4.H b/src/Base/Array4.H
index 96cb84f9..a35f1c98 100644
--- a/src/Base/Array4.H
+++ b/src/Base/Array4.H
@@ -6,6 +6,7 @@
 #pragma once
 
 #include "pyAMReX.H"
+#include "dlpack.h"
 
 #include <AMReX_Array4.H>
 #include <AMReX_BLassert.H>
@@ -18,15 +19,6 @@
 #include <sstream>
 #include <type_traits>
 #include <vector>
-#include "dlpack.h"
-
-// GPU backend headers for device detection
-#ifdef AMREX_USE_CUDA
-#include <cuda_runtime.h>
-#endif
-#ifdef AMREX_USE_HIP
-#include <hip/hip_runtime.h>
-#endif
 
 
 namespace
@@ -194,6 +186,7 @@ namespace pyAMReX
          */
 
 
+        /*
         // CPU: __array_interface__ v3
         // https://numpy.org/doc/stable/reference/arrays.interface.html
         .def_property_readonly("__array_interface__", [](Array4<T> const & a4) {
@@ -229,60 +222,26 @@ namespace pyAMReX
             d["version"] = 3;
             return d;
         })
+        */
 
         // DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
         // https://dmlc.github.io/dlpack/latest/
         // https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
         // https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
-        .def("__dlpack__", [](Array4<T> const &a4, py::handle stream = py::none()) {
+        .def("__dlpack__", [](Array4<T> const &a4, [[maybe_unused]] py::handle stream = py::none()) {
             // Allocate shape/strides arrays
             constexpr int ndim = 4;
             auto const len = length(a4);
             auto *shape = new int64_t[ndim]{a4.nComp(), len.z, len.y, len.x};
             auto *strides = new int64_t[ndim]{a4.nstride, a4.kstride, a4.jstride, 1};
-            // DLPack dtype
-            DLDataType dtype{};
-            if constexpr (std::is_same_v<T, float>) { dtype.code = kDLFloat; dtype.bits = 32; dtype.lanes = 1; }
-            else if constexpr (std::is_same_v<T, double>) { dtype.code = kDLFloat; dtype.bits = 64; dtype.lanes = 1; }
-            else if constexpr (std::is_same_v<T, std::int32_t>) { dtype.code = kDLInt; dtype.bits = 32; dtype.lanes = 1; }
-            else if constexpr (std::is_same_v<T, std::int64_t>) { dtype.code = kDLInt; dtype.bits = 64; dtype.lanes = 1; }
-            else if constexpr (std::is_same_v<T, std::uint32_t>) { dtype.code = kDLUInt; dtype.bits = 32; dtype.lanes = 1; }
-            else if constexpr (std::is_same_v<T, std::uint64_t>) { dtype.code = kDLUInt; dtype.bits = 64; dtype.lanes = 1; }
-            else { throw std::runtime_error("Unsupported dtype for DLPack"); }
-
-            // Device detection based on AMReX GPU backend
-            DLDevice device{ kDLCPU, 0 };
-#ifdef AMREX_USE_CUDA
-            // Check if data is on GPU by checking if pointer is in CUDA memory
-            cudaPointerAttributes attr;
-            cudaError_t err = cudaPointerGetAttributes(&attr, a4.dataPtr());
-            if (err == cudaSuccess && attr.memoryType == cudaMemoryTypeDevice) {
-                device.device_type = kDLCUDA;
-                device.device_id = attr.device;
-            }
-#elif defined(AMREX_USE_HIP)
-            // Check if data is on GPU by checking if pointer is in HIP memory
-            hipPointerAttribute_t attr;
-            hipError_t err = hipPointerGetAttributes(&attr, a4.dataPtr());
-            if (err == hipSuccess && attr.memoryType == hipMemoryTypeDevice) {
-                device.device_type = kDLROCM;
-                device.device_id = attr.device;
-            }
-#elif defined(AMREX_USE_DPCPP)
-            // For SYCL, we need to check if the data is on device
-            // This is more complex as SYCL doesn't have a simple pointer check
-            // For now, assume CPU - SYCL support would need more sophisticated detection
-            // device.device_type = kDLExtDev;  // SYCL would use extended device type
-            // device.device_id = 0;
-#endif
 
             // Construct DLTensor
             auto *dl_tensor = new DLManagedTensor;
             dl_tensor->dl_tensor.data = const_cast<void*>(static_cast<void const*>(a4.dataPtr()));
-            dl_tensor->dl_tensor.device = device;
+            dl_tensor->dl_tensor.device = dlpack::detect_device_from_pointer(a4.dataPtr());
             dl_tensor->dl_tensor.ndim = ndim;
-            dl_tensor->dl_tensor.dtype = dtype;
+            dl_tensor->dl_tensor.dtype = dlpack::get_dlpack_dtype<T>();
             dl_tensor->dl_tensor.shape = shape;
             dl_tensor->dl_tensor.strides = strides;
             dl_tensor->dl_tensor.byte_offset = 0;
@@ -297,40 +256,16 @@ namespace pyAMReX
             auto* tensor = static_cast<DLManagedTensor*>(ptr);
             tensor->deleter(tensor);
             });
-        }, py::arg("stream") = py::none(), R"doc(
+        },
+        py::arg("stream") = py::none(),
+        R"doc(
         DLPack protocol for zero-copy tensor exchange.
         See https://dmlc.github.io/dlpack/latest/ for details.
-        )doc")
+        )doc"
+        )
         .def("__dlpack_device__", [](Array4<T> const &a4) {
-            // Device detection based on AMReX GPU backend
-            int device_type = kDLCPU;
-            int device_id = 0;
-
-#ifdef AMREX_USE_CUDA
-            // Check if data is on GPU by checking if pointer is in CUDA memory
-            cudaPointerAttributes attr;
-            cudaError_t err = cudaPointerGetAttributes(&attr, a4.dataPtr());
-            if (err == cudaSuccess && attr.memoryType == cudaMemoryTypeDevice) {
-                device_type = kDLCUDA;
-                device_id = attr.device;
-            }
-#elif defined(AMREX_USE_HIP)
-            // Check if data is on GPU by checking if pointer is in HIP memory
-            hipPointerAttribute_t attr;
-            hipError_t err = hipPointerGetAttributes(&attr, a4.dataPtr());
-            if (err == hipSuccess && attr.memoryType == hipMemoryTypeDevice) {
-                device_type = kDLROCM;
-                device_id = attr.device;
-            }
-#elif defined(AMREX_USE_DPCPP)
-            // For SYCL, we need to check if the data is on device
-            // This is more complex as SYCL doesn't have a simple pointer check
-            // For now, assume CPU - SYCL support would need more sophisticated detection
-            // device_type = kDLExtDev;  // SYCL would use extended device type
-            // device_id = 0;
-#endif
-
-            return std::make_tuple(device_type, device_id);
+            DLDevice device = dlpack::detect_device_from_pointer(a4.dataPtr());
+            return std::make_tuple(device.device_type, device.device_id);
         }, R"doc(
         DLPack device info (device_type, device_id).
         )doc")
diff --git a/src/Base/dlpack.h b/src/Base/dlpack.h
deleted file mode 100644
index 812b7640..00000000
--- a/src/Base/dlpack.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef AMREX_DLPACK_H_
-#define AMREX_DLPACK_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stdint.h>
-#include <stddef.h>
-
-// Device type codes
-#define kDLCPU 1
-#define kDLCUDA 2
-#define kDLCUDAHost 3
-#define kDLOpenCL 4
-#define kDLVulkan 7
-#define kDLMetal 8
-#define kDLVPI 9
-#define kDLROCM 10
-#define kDLROCMHost 11
-#define kDLExtDev 12
-
-// Data type codes
-#define kDLInt 0
-#define kDLUInt 1
-#define kDLFloat 2
-
-// Device context
-typedef struct {
-    int32_t device_type;
-    int32_t device_id;
-} DLDevice;
-
-// Data type
-typedef struct {
-    uint8_t code;    // kDLFloat=2, kDLInt=0, kDLUInt=1
-    uint8_t bits;    // number of bits, e.g., 32, 64
-    uint16_t lanes;  // number of lanes (for vector types)
-} DLDataType;
-
-// Tensor structure
-typedef struct {
-    void* data;
-    DLDevice device;
-    int32_t ndim;
-    int64_t* shape;
-    int64_t* strides;  // in elements, not bytes; can be NULL for compact
-    uint64_t byte_offset;
-    DLDataType dtype;
-} DLTensor;
-
-// Managed tensor with deleter
-struct DLManagedTensor;
-typedef void (*DLManagedTensorDeleter)(struct DLManagedTensor* self);
-
-typedef struct DLManagedTensor {
-    DLTensor dl_tensor;
-    void* manager_ctx;
-    DLManagedTensorDeleter deleter;
-} DLManagedTensor;
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AMREX_DLPACK_H_
diff --git a/src/dlpack.h b/src/dlpack.h
new file mode 100644
index 00000000..f962e270
--- /dev/null
+++ b/src/dlpack.h
@@ -0,0 +1,466 @@
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ *
+ * Source: https://github.com/dmlc/dlpack/blob/v1.1/include/dlpack/dlpack.h
+ */
+#ifndef AMREX_DLPACK_H_
+#define AMREX_DLPACK_H_
+
+#include <AMReX_Extension.H>
+#include <AMReX_GpuDevice.H>
+
+#include <stdexcept>
+#include <type_traits>
+
+extern "C" {
+
+#include <stdint.h>
+#include <stddef.h>
+
+/*!
+ * \brief The DLPack version.
+ *
+ * A change in major version indicates that we have changed the
+ * data layout of the ABI - DLManagedTensorVersioned.
+ * + * A change in minor version indicates that we have added new + * code, such as a new device type, but the ABI is kept the same. + * + * If an obtained DLPack tensor has a major version that disagrees + * with the version number specified in this header file + * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter + * (and it is safe to do so). It is not safe to access any other fields + * as the memory layout will have changed. + * + * In the case of a minor version mismatch, the tensor can be safely used as + * long as the consumer knows how to interpret all fields. Minor version + * updates indicate the addition of enumeration values. + */ +typedef struct { + /*! \brief DLPack major version. */ + uint32_t major; + /*! \brief DLPack minor version. */ + uint32_t minor; +} DLPackVersion; + +/*! + * \brief The device type in DLDevice. + */ +typedef enum : int32_t { + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! + * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! + * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, + /*! \brief Microsoft MAIA devices */ + kDLMAIA = 17, +} DLDeviceType; + +/*! + * \brief A Device for Tensor and operator. + */ +typedef struct { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. + */ + int32_t device_id; +} DLDevice; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + /*! \brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to be well-defined. + */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! + * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, + /*! \brief FP8 data types */ + kDLFloat8_e3m4 = 7U, + kDLFloat8_e4m3 = 8U, + kDLFloat8_e4m3b11fnuz = 9U, + kDLFloat8_e4m3fn = 10U, + kDLFloat8_e4m3fnuz = 11U, + kDLFloat8_e5m2 = 12U, + kDLFloat8_e5m2fnuz = 13U, + kDLFloat8_e8m0fnu = 14U, + /*! \brief FP6 data types + * Setting bits != 6 is currently unspecified, and the producer must ensure it is set + * while the consumer must stop importing if the value is unexpected. 
+ */ + kDLFloat6_e2m3fn = 15U, + kDLFloat6_e3m2fn = 16U, + /*! \brief FP4 data types + * Setting bits != 4 is currently unspecified, and the producer must ensure it is set + * while the consumer must stop importing if the value is unexpected. + */ + kDLFloat4_e2m1fn = 17U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. The data type is assumed to follow the + * native endian-ness. An explicit error message should be raised when attempting to + * export an array with non-native endianness + * + * Examples + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 + * - std::complex: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) + * - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory) + * - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory) + * - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory) + * + * When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e., + * for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element. + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The data pointer points to the allocated data. This will be CUDA + * device pointer or cl_mem handle in OpenCL. It may be opaque on some device + * types. This pointer is always aligned to 256 bytes as in CUDA. The + * `byte_offset` field should be used to point to the beginning of the data. + * + * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, + * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed + * (after which this note will be updated); at the moment it is recommended + * to not rely on the data pointer being correctly aligned. + * + * For given DLTensor, the size of memory required to store the contents of + * data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + * + * Note that if the tensor is of size zero, then the data pointer should be + * set to `NULL`. + */ + void* data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int32_t ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + int64_t* shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes) + * can be NULL, indicating tensor is compact and row-majored. + */ + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. 
+ * intended to facilitate the borrowing of DLTensor by another framework. It is
+ * not meant to transfer the tensor. When the borrowing framework doesn't need
+ * the tensor, it should call the deleter to notify the host that the resource
+ * is no longer needed.
+ *
+ * \note This data structure is used as Legacy DLManagedTensor
+ *       in DLPack exchange and is deprecated after DLPack v0.8
+ *       Use DLManagedTensorVersioned instead.
+ *       This data structure may get renamed or deleted in future versions.
+ *
+ * \sa DLManagedTensorVersioned
+ */
+typedef struct DLManagedTensor {
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+  /*! \brief the context of the original host framework of DLManagedTensor in
+   *   which DLManagedTensor is used in the framework. It can also be NULL.
+   */
+  void * manager_ctx;
+  /*!
+   * \brief Destructor - this should be called
+   * to destruct the manager_ctx which backs the DLManagedTensor. It can be
+   * NULL if there is no way for the caller to provide a reasonable destructor.
+   * The destructor deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensor * self);
+} DLManagedTensor;
+
+// bit masks used in in the DLManagedTensorVersioned
+
+/*! \brief bit mask to indicate that the tensor is read only. */
+#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
+
+/*!
+ * \brief bit mask to indicate that the tensor is a copy made by the producer.
+ *
+ * If set, the tensor is considered solely owned throughout its lifetime by the
+ * consumer, until the producer-provided deleter is invoked.
+ */
+#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)
+
+/*
+ * \brief bit mask to indicate that whether a sub-byte type is packed or padded.
+ *
+ * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
+ * be set by the producer to signal that a tensor of sub-byte type is padded.
+ */
+#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
+
+/*!
+ * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor by
+ * another framework. It is not meant to transfer the tensor. When the borrowing
+ * framework doesn't need the tensor, it should call the deleter to notify the
+ * host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+struct DLManagedTensorVersioned {
+  /*!
+   * \brief The API and ABI version of the current managed Tensor
+   */
+  DLPackVersion version;
+  /*!
+   * \brief the context of the original host framework.
+   *
+   * Stores DLManagedTensorVersioned is used in the
+   * framework. It can also be NULL.
+   */
+  void *manager_ctx;
+  /*!
+   * \brief Destructor.
+   *
+   * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned.
+   * It can be NULL if there is no way for the caller to provide a reasonable
+   * destructor. The destructor deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensorVersioned *self);
+  /*!
+   * \brief Additional bitmask flags information about the tensor.
+   *
+   * By default the flags should be set to 0.
+   *
+   * \note Future ABI changes should keep everything until this field
+   *       stable, to ensure that deleter can be correctly called.
+   *
+   * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+   * \sa DLPACK_FLAG_BITMASK_IS_COPIED
+   */
+  uint64_t flags;
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+};
+
+}  // extern "C"
+
+namespace pyAMReX::dlpack
+{
+
+    template <typename T>
+    AMREX_INLINE
+    DLDataType get_dlpack_dtype ()
+    {
+        DLDataType dtype{};
+
+        if constexpr (std::is_same_v<T, float>) {
+            dtype.code = kDLFloat;
+            dtype.bits = 32;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<T, double>) {
+            dtype.code = kDLFloat;
+            dtype.bits = 64;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<T, std::int32_t>) {
+            dtype.code = kDLInt;
+            dtype.bits = 32;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<T, std::int64_t>) {
+            dtype.code = kDLInt;
+            dtype.bits = 64;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<T, std::uint32_t>) {
+            dtype.code = kDLUInt;
+            dtype.bits = 32;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<T, std::uint64_t>) {
+            dtype.code = kDLUInt;
+            dtype.bits = 64;
+            dtype.lanes = 1;
+        }
+        else {
+            throw std::runtime_error("Unsupported dtype for DLPack");
+        }
+
+        return dtype;
+    }
+
+    AMREX_INLINE
+    DLDevice detect_device_from_pointer ([[maybe_unused]] const void* ptr)
+    {
+        DLDevice device{ kDLCPU, 0 };
+
+#ifdef AMREX_USE_CUDA
+        // Check if data is on GPU by checking if pointer is in CUDA memory
+        // note: cudaPointerGetAttributes is quite expensive, remove and
+        //       assume device-side if need be.
+        cudaPointerAttributes attr;
+        cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
+        if (err == cudaSuccess && attr.memoryType == cudaMemoryTypeDevice) {
+            device.device_type = kDLCUDA;
+            device.device_id = attr.device;
+        }
+#elif defined(AMREX_USE_HIP)
+        // Check if data is on GPU by checking if pointer is in HIP memory
+        // note: hipPointerGetAttributes is quite expensive, remove and
+        //       assume device-side if need be.
+        hipPointerAttribute_t attr;
+        hipError_t err = hipPointerGetAttributes(&attr, ptr);
+        if (err == hipSuccess && attr.memoryType == hipMemoryTypeDevice) {
+            device.device_type = kDLROCM;
+            device.device_id = attr.device;
+        }
+
+#elif defined(AMREX_USE_DPCPP)
+        // try {
+            // Get the SYCL context and queue from AMReX
+            auto const& queue = amrex::Gpu::Device::streamQueue();
+            auto const& context = queue.get_context();
+
+            // Try to get pointer attributes using SYCL USM queries
+            auto usm_type = sycl::get_pointer_type(ptr, context);
+
+            if (usm_type == sycl::usm::alloc::device ||
+                usm_type == sycl::usm::alloc::shared) {
+                device.device_type = kDLOneAPI;
+
+                // Try to get the actual device from the pointer
+                try {
+                    auto device_ptr = sycl::get_pointer_device(ptr, context);
+                    device.device_id = 0;  // Default to first device
+
+                    auto devices = context.get_devices();
+                    for (size_t i = 0; i < devices.size(); ++i) {
+                        if (devices[i] == device_ptr) {
+                            device.device_id = static_cast<int32_t>(i);
+                            break;
+                        }
+                    }
+                } catch (const sycl::exception&) {
+                    // If we can't determine the specific device, default to 0
+                    device.device_id = 0;
+                }
+            } else if (usm_type == sycl::usm::alloc::host) {
+                // Host USM allocation - still oneAPI but accessible from host
+                device.device_type = kDLOneAPI;
+                device.device_id = 0;
+            }
+            // If usm_type is sycl::usm::alloc::unknown, it might be regular CPU memory
+            // In that case, we keep the default CPU device type set above
+
+        /*
+        }
+        catch (const sycl::exception&) {
+            // If SYCL queries fail, assume it's regular CPU memory
+            // device remains as kDLCPU, 0 (set at function start)
+        } catch (...) {
+            // Handle any other exceptions gracefully
+            // device remains as kDLCPU, 0 (set at function start)
+        }
+        */
+#endif
+
+        return device;
+    }
+
+}  // namespace pyAMReX::dlpack
+
+#endif  // AMREX_DLPACK_H_
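Device detection now lives in one helper, pyAMReX::dlpack::detect_device_from_pointer, so __dlpack__ and __dlpack_device__ always report the same device. On a CUDA build the expected hand-off to CuPy looks roughly like this (a sketch, assuming device-allocated data and an installed CuPy):

    import cupy as cp

    device_type, device_id = arr.__dlpack_device__()   # (2, 0): kDLCUDA on GPU 0
    d_view = cp.from_dlpack(arr)                       # zero-copy device-side view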
From 0349655b0194bdd397ea98b66edfa135eac1de04 Mon Sep 17 00:00:00 2001
From: Axel Huebl
Date: Tue, 22 Jul 2025 22:38:07 -0700
Subject: [PATCH 03/16] More Cleanup

DLPack 1.1, e.g., in NumPy 2.1+

Tests do not yet pass.
---
 src/Base/Array4.H    | 67 +++++++++++++++++++++++++++++---------------
 src/dlpack.h         | 49 +++++++-------------------------
 tests/test_array4.py | 14 +++++----
 3 files changed, 63 insertions(+), 67 deletions(-)

diff --git a/src/Base/Array4.H b/src/Base/Array4.H
index a35f1c98..c6285eb4 100644
--- a/src/Base/Array4.H
+++ b/src/Base/Array4.H
@@ -225,39 +225,60 @@ namespace pyAMReX
          */
 
 
-        // DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
+        // DLPack v1.1 protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
         // https://dmlc.github.io/dlpack/latest/
         // https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
         // https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
-        .def("__dlpack__", [](Array4<T> const &a4, [[maybe_unused]] py::handle stream = py::none()) {
+        .def("__dlpack__", [](
+            Array4<T> const &a4
+            /* TODO:
+            [[maybe_unused]] py::handle stream,
+            [[maybe_unused]] std::tuple<int, int> max_version,
+            [[maybe_unused]] std::tuple<int, int> dl_device,
+            [[maybe_unused]] bool copy
+            */
+        )
+        {
             // Allocate shape/strides arrays
             constexpr int ndim = 4;
             auto const len = length(a4);
-            auto *shape = new int64_t[ndim]{a4.nComp(), len.z, len.y, len.x};
-            auto *strides = new int64_t[ndim]{a4.nstride, a4.kstride, a4.jstride, 1};
-
-            // Construct DLTensor
-            auto *dl_tensor = new DLManagedTensor;
-            dl_tensor->dl_tensor.data = const_cast<void*>(static_cast<void const*>(a4.dataPtr()));
-            dl_tensor->dl_tensor.device = dlpack::detect_device_from_pointer(a4.dataPtr());
-            dl_tensor->dl_tensor.ndim = ndim;
-            dl_tensor->dl_tensor.dtype = dlpack::get_dlpack_dtype<T>();
-            dl_tensor->dl_tensor.shape = shape;
-            dl_tensor->dl_tensor.strides = strides;
-            dl_tensor->dl_tensor.byte_offset = 0;
-            dl_tensor->manager_ctx = nullptr;
-            dl_tensor->deleter = [](DLManagedTensor *self) {
+
+            // Construct DLManagedTensorVersioned (DLPack 1.1 standard)
+            auto *dl_mgt_tensor = new DLManagedTensorVersioned;
+            dl_mgt_tensor->version = DLPackVersion{};
+            dl_mgt_tensor->flags = 0;  // No special flags
+            dl_mgt_tensor->dl_tensor.data = const_cast<void*>(static_cast<void const*>(a4.dataPtr()));
+            dl_mgt_tensor->dl_tensor.device = dlpack::detect_device_from_pointer(a4.dataPtr());
+            dl_mgt_tensor->dl_tensor.ndim = ndim;
+            dl_mgt_tensor->dl_tensor.dtype = dlpack::get_dlpack_dtype<T>();
+            dl_mgt_tensor->dl_tensor.shape = new int64_t[ndim]{a4.nComp(), len.z, len.y, len.x};
+            dl_mgt_tensor->dl_tensor.strides = new int64_t[ndim]{a4.nstride, a4.kstride, a4.jstride, 1};
+            dl_mgt_tensor->dl_tensor.byte_offset = 0;
+            dl_mgt_tensor->manager_ctx = nullptr;  // TODO: we can increase/decrease the Python ref counter of the producer here
+            dl_mgt_tensor->deleter = [](DLManagedTensorVersioned *self) {
                 delete[] self->dl_tensor.shape;
                 delete[] self->dl_tensor.strides;
                 delete self;
             };
             // Return as Python capsule
-            return py::capsule(dl_tensor, "dltensor", [](void* ptr) {
-                auto* tensor = static_cast<DLManagedTensor*>(ptr);
-                tensor->deleter(tensor);
-            });
+            return py::capsule(
+                dl_mgt_tensor,
+                "dltensor_versioned",
+                /*[](void* ptr) {
+                    auto* tensor = static_cast<DLManagedTensorVersioned*>(ptr);
+                    tensor->deleter(tensor);
+                }*/
+                [](PyObject *capsule)
+                {
+                    auto *p = static_cast<DLManagedTensorVersioned*>(
+                        PyCapsule_GetPointer(capsule, "dltensor_versioned"));
+                    if (p && p->deleter)
+                        p->deleter(p);
+                }
+            );
         },
-        py::arg("stream") = py::none(),
+        //py::arg("stream") = py::none(),
+        // ... other args & their defaults
         R"doc(
         DLPack protocol for zero-copy tensor exchange.
         See https://dmlc.github.io/dlpack/latest/ for details.
@@ -265,11 +286,13 @@ namespace pyAMReX
         )doc"
         )
         .def("__dlpack_device__", [](Array4<T> const &a4) {
             DLDevice device = dlpack::detect_device_from_pointer(a4.dataPtr());
-            return std::make_tuple(device.device_type, device.device_id);
+            return std::make_tuple(static_cast<int>(device.device_type), device.device_id);
         }, R"doc(
         DLPack device info (device_type, device_id).
         )doc")
+
+
         .def("to_host", [](Array4<T> const & a4) {
             // py::tuple to std::vector
             auto const a4i = pyAMReX::array_interface(a4);
diff --git a/src/dlpack.h b/src/dlpack.h
index f962e270..4dd84dee 100644
--- a/src/dlpack.h
+++ b/src/dlpack.h
@@ -40,9 +40,9 @@ extern "C" {
  */
 typedef struct {
   /*! \brief DLPack major version. */
-  uint32_t major;
+  uint32_t major = 1;
   /*! \brief DLPack minor version. */
-  uint32_t minor;
+  uint32_t minor = 1;
 } DLPackVersion;
 
 /*!
@@ -238,37 +238,7 @@ typedef struct {
   uint64_t byte_offset;
 } DLTensor;
 
-/*!
- * \brief C Tensor object, manage memory of DLTensor. This data structure is
- * intended to facilitate the borrowing of DLTensor by another framework. It is
- * not meant to transfer the tensor. When the borrowing framework doesn't need
- * the tensor, it should call the deleter to notify the host that the resource
- * is no longer needed.
- *
- * \note This data structure is used as Legacy DLManagedTensor
- *       in DLPack exchange and is deprecated after DLPack v0.8
- *       Use DLManagedTensorVersioned instead.
- *       This data structure may get renamed or deleted in future versions.
- *
- * \sa DLManagedTensorVersioned
- */
-typedef struct DLManagedTensor {
-  /*! \brief DLTensor which is being memory managed */
-  DLTensor dl_tensor;
-  /*! \brief the context of the original host framework of DLManagedTensor in
-   *   which DLManagedTensor is used in the framework. It can also be NULL.
-   */
-  void * manager_ctx;
-  /*!
-   * \brief Destructor - this should be called
-   * to destruct the manager_ctx which backs the DLManagedTensor. It can be
-   * NULL if there is no way for the caller to provide a reasonable destructor.
-   * The destructor deletes the argument self as well.
-   */
-  void (*deleter)(struct DLManagedTensor * self);
-} DLManagedTensor;
-
-// bit masks used in in the DLManagedTensorVersioned
+// bit masks used in the DLManagedTensorVersioned
 
 /*! \brief bit mask to indicate that the tensor is read only. */
 #define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
@@ -314,34 +284,35 @@ namespace pyAMReX::dlpack
     AMREX_INLINE
     DLDataType get_dlpack_dtype ()
     {
+        using V = std::decay_t<T>;
         DLDataType dtype{};
 
-        if constexpr (std::is_same_v<T, float>) {
+        if constexpr (std::is_same_v<V, float>) {
             dtype.code = kDLFloat;
             dtype.bits = 32;
             dtype.lanes = 1;
         }
-        else if constexpr (std::is_same_v<T, double>) {
+        else if constexpr (std::is_same_v<V, double>) {
             dtype.code = kDLFloat;
             dtype.bits = 64;
             dtype.lanes = 1;
         }
-        else if constexpr (std::is_same_v<T, std::int32_t>) {
+        else if constexpr (std::is_same_v<V, std::int32_t>) {
             dtype.code = kDLInt;
             dtype.bits = 32;
             dtype.lanes = 1;
         }
-        else if constexpr (std::is_same_v<T, std::int64_t>) {
+        else if constexpr (std::is_same_v<V, std::int64_t>) {
             dtype.code = kDLInt;
             dtype.bits = 64;
             dtype.lanes = 1;
         }
-        else if constexpr (std::is_same_v<T, std::uint32_t>) {
+        else if constexpr (std::is_same_v<V, std::uint32_t>) {
             dtype.code = kDLUInt;
             dtype.bits = 32;
             dtype.lanes = 1;
         }
-        else if constexpr (std::is_same_v<T, std::uint64_t>) {
+        else if constexpr (std::is_same_v<V, std::uint64_t>) {
             dtype.code = kDLUInt;
             dtype.bits = 64;
             dtype.lanes = 1;
diff --git a/tests/test_array4.py b/tests/test_array4.py
index 99c37d12..44570837 100644
--- a/tests/test_array4.py
+++ b/tests/test_array4.py
@@ -31,7 +31,9 @@ def test_array4():
     )
     print(f"\nx: {x.__array_interface__} {x.dtype}")
     arr = amr.Array4_double(x)
-    print(f"arr: {arr.__array_interface__}")
+    print(f"arr: DLPack device info: {arr.__dlpack_device__()}")
+    # print(f"arr: DLPack: {arr.__dlpack__()}")
+    print(f"x.shape: {x.shape}")
     print(arr)
 
     assert arr.nComp == 1
@@ -44,16 +46,16 @@ def test_array4():
     assert arr[0, 0, 0] == 1
     assert arr[3, 2, 1] == 1
 
-    # copy to numpy
-    c_arr2np = np.array(arr, copy=True)  # segfaults on Windows
+    # copy to numpy using DLPack
+    c_arr2np = np.from_dlpack(arr)
     assert c_arr2np.ndim == 4
     assert c_arr2np.dtype == np.dtype("double")
     print(f"c_arr2np: {c_arr2np.__array_interface__}")
     np.testing.assert_array_equal(x, c_arr2np[0, :, :, :])
     assert c_arr2np[0, 1, 1, 1] == 42
 
-    # view to numpy
-    v_arr2np = np.array(arr, copy=False)
+    # view to numpy using DLPack
+    v_arr2np = np.from_dlpack(arr)
     assert c_arr2np.ndim == 4
     assert v_arr2np.dtype == np.dtype("double")
     np.testing.assert_array_equal(x, v_arr2np[0, :, :, :])
@@ -65,7 +67,7 @@ def test_array4():
 
     # copy array4 (view)
     c_arr = amr.Array4_double(arr)
-    v_carr2np = np.array(c_arr, copy=False)
+    v_carr2np = np.from_dlpack(c_arr)
     x[1, 1, 1] = 44
     assert v_carr2np[0, 1, 1, 1] == 44
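With DLPack 1.1 the tensor travels as a DLManagedTensorVersioned in a capsule named "dltensor_versioned"; per the spec, a consumer that takes ownership renames the capsule to "used_dltensor_versioned", so the producer's destructor must not free it a second time — that is what the next patch adds. Consumers also need to understand versioned capsules at all; a quick check (assuming NumPy >= 2.1, which supports DLPack 1.x):

    import numpy as np

    major, minor = (int(v) for v in np.__version__.split(".")[:2])
    assert (major, minor) >= (2, 1)
    view = np.from_dlpack(arr)   # consumes the "dltensor_versioned" capsule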
From 181145dc184787e80d3d4092dceda3887e825225 Mon Sep 17 00:00:00 2001
From: Roelof Groenewald
Date: Wed, 23 Jul 2025 17:18:34 -0700
Subject: [PATCH 04/16] early return in `PyCapsule_Destructor` if the capsule
 name is "used_dltensor_versioned"

Signed-off-by: Roelof Groenewald
---
 src/Base/Array4.H | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/Base/Array4.H b/src/Base/Array4.H
index c6285eb4..66f6e488 100644
--- a/src/Base/Array4.H
+++ b/src/Base/Array4.H
@@ -270,6 +270,9 @@ namespace pyAMReX
                 }*/
                 [](PyObject *capsule)
                 {
+                    if (PyCapsule_IsValid(capsule, "used_dltensor_versioned")) {
+                        return;  /* Do nothing if the capsule has been consumed. */
+                    }
                     auto *p = static_cast<DLManagedTensorVersioned*>(
                         PyCapsule_GetPointer(capsule, "dltensor_versioned"));
                     if (p && p->deleter)
@@ -293,7 +296,6 @@ namespace pyAMReX
         )doc")
 
-
         .def("to_host", [](Array4<T> const & a4) {
             // py::tuple to std::vector
             auto const a4i = pyAMReX::array_interface(a4);

From 81929d7aee12a969d704b7588eb13b1804c31b09 Mon Sep 17 00:00:00 2001
From: Roelof Groenewald
Date: Thu, 24 Jul 2025 22:55:05 -0700
Subject: [PATCH 05/16] accept keyword arguments in Array4.__dlpack__ and fix
 Cuda memory type check

Signed-off-by: Roelof Groenewald
---
 src/Base/Array4.H | 5 +++--
 src/dlpack.h      | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/Base/Array4.H b/src/Base/Array4.H
index 66f6e488..bf3979ca 100644
--- a/src/Base/Array4.H
+++ b/src/Base/Array4.H
@@ -230,8 +230,9 @@ namespace pyAMReX
         // https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
         // https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
         .def("__dlpack__", [](
-            Array4<T> const &a4
-            /* TODO:
+            Array4<T> const &a4,
+            py::kwargs kwargs
+            /* TODO: Handle keyword arguments
             [[maybe_unused]] py::handle stream,
             [[maybe_unused]] std::tuple<int, int> max_version,
             [[maybe_unused]] std::tuple<int, int> dl_device,
diff --git a/src/dlpack.h b/src/dlpack.h
index 4dd84dee..d9054876 100644
--- a/src/dlpack.h
+++ b/src/dlpack.h
@@ -365,7 +365,7 @@ namespace pyAMReX::dlpack
         // assume device-side if need be.
         cudaPointerAttributes attr;
         cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
-        if (err == cudaSuccess && attr.memoryType == cudaMemoryTypeDevice) {
+        if (err == cudaSuccess && attr.type == cudaMemoryTypeDevice) {
             device.device_type = kDLCUDA;
             device.device_id = attr.device;
         }
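Accepting py::kwargs means Array-API-style probes such as obj.__dlpack__(max_version=(1, 1)) now bind instead of raising a TypeError, even though the values are not yet interpreted. A sketch of the calls that now succeed:

    cap = arr.__dlpack__(stream=None, max_version=(1, 1))   # accepted, currently ignored
    print(arr.__dlpack_device__())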
From b04171ca649f9a00e6386891feed523ac6ea07bf Mon Sep 17 00:00:00 2001
From: Roelof Groenewald
Date: Fri, 25 Jul 2025 23:05:08 -0700
Subject: [PATCH 06/16] add `array4_to_dpnp` and modify `array4_to_xp` logic
 to distinguish GPU backends

Signed-off-by: Roelof Groenewald
---
 src/amrex/extensions/Array4.py | 62 ++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 6 deletions(-)

diff --git a/src/amrex/extensions/Array4.py b/src/amrex/extensions/Array4.py
index 8cd73f8a..6bf673b6 100644
--- a/src/amrex/extensions/Array4.py
+++ b/src/amrex/extensions/Array4.py
@@ -1,7 +1,7 @@
 """
 This file is part of pyAMReX
 
-Copyright 2023 AMReX community
+Copyright 2023-2025 AMReX community
 
 Authors: Axel Huebl
 License: BSD-3-Clause-LBNL
 """
@@ -92,9 +92,52 @@ def array4_to_cupy(self, copy=False, order="F"):
         raise ValueError("The order argument must be F or C.")
 
 
+def array4_to_dpnp(self, copy=False, order="F"):
+    """
+    Provide a dpnp view into an Array4.
+
+    This includes ngrow guard cells of the box.
+
+    Note on the order of indices:
+    By default, this is as in AMReX in Fortran contiguous order, indexing as
+    x,y,z. This has performance implications for use in external libraries such
+    as dpnp.
+    The order="C" option will index as z,y,x and may perform better.
+    https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074
+
+    Parameters
+    ----------
+    self : amrex.Array4_*
+        An Array4 class in pyAMReX
+    copy : bool, optional
+        Copy the data if true, otherwise create a view (default).
+    order : string, optional
+        F order (default) or C. C is faster with external libraries.
+
+    Returns
+    -------
+    dpnp.array
+        A dpnp n-dimensional array.
+
+    Raises
+    ------
+    ImportError
+        Raises an exception if dpnp is not installed
+    """
+    import dpnp as dp
+
+    if order == "F":
+        return dp.from_dlpack(self, copy=copy).T
+    elif order == "C":
+        return dp.from_dlpack(self, copy=copy)
+    else:
+        raise ValueError("The order argument must be F or C.")
+
+
 def array4_to_xp(self, copy=False, order="F"):
     """
-    Provide a NumPy or CuPy view into an Array4, depending on amr.Config.have_gpu .
+    Provide a NumPy, CuPy or dpnp view into an Array4, depending on amr.Config.have_gpu
+    and amr.Config.gpu_backend .
 
     This function is similar to CuPy's xp naming suggestion for CPU/GPU agnostic code:
     https://docs.cupy.dev/en/stable/user_guide/basic.html#how-to-write-cpu-gpu-agnostic-code
@@ -120,14 +163,20 @@ def array4_to_xp(self, copy=False, order="F"):
     Returns
     -------
     xp.array
-        A NumPy or CuPy n-dimensional array.
+        A NumPy, CuPy or dpnp n-dimensional array.
     """
     import inspect
 
     amr = inspect.getmodule(self)
-    return (
-        self.to_cupy(copy, order) if amr.Config.have_gpu else self.to_numpy(copy, order)
-    )
+
+    if amr.Config.have_gpu:
+        if amr.Config.gpu_backend == 'SYCL':
+            return self.to_dpnp(copy, order)
+        else:  # if not SYCL use cupy
+            return self.to_cupy(copy, order)
+
+    # if no GPU, use NumPy
+    return self.to_numpy(copy, order)
 
 
 def register_Array4_extension(amr):
@@ -144,4 +193,5 @@ def register_Array4_extension(amr):
     ):
         Array4_type.to_numpy = array4_to_numpy
         Array4_type.to_cupy = array4_to_cupy
+        Array4_type.to_dpnp = array4_to_dpnp
         Array4_type.to_xp = array4_to_xp
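With to_dpnp registered next to to_numpy and to_cupy, backend-agnostic user code stays a one-liner through to_xp. A sketch of typical use (3D build; amr.Config.gpu_backend reflects what was compiled in):

    host = arr.to_numpy(copy=False)   # always a host-side view
    data = arr.to_xp()                # NumPy, CuPy, or dpnp to match the build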
From ccce85c8edac2a8346b4590db0a2c663aba2fe6c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 26 Jul 2025 06:06:41 +0000
Subject: [PATCH 07/16] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/amrex/extensions/Array4.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amrex/extensions/Array4.py b/src/amrex/extensions/Array4.py
index 6bf673b6..64b06fab 100644
--- a/src/amrex/extensions/Array4.py
+++ b/src/amrex/extensions/Array4.py
@@ -170,7 +170,7 @@ def array4_to_xp(self, copy=False, order="F"):
     amr = inspect.getmodule(self)
 
     if amr.Config.have_gpu:
-        if amr.Config.gpu_backend == 'SYCL':
+        if amr.Config.gpu_backend == "SYCL":
             return self.to_dpnp(copy, order)
         else:  # if not SYCL use cupy
             return self.to_cupy(copy, order)

From 25d3b348730bd517eba05354775c2a7557ebad33 Mon Sep 17 00:00:00 2001
From: Roelof Groenewald
Date: Sat, 26 Jul 2025 00:14:03 -0700
Subject: [PATCH 08/16] update `MultiFab.py` to also support SYCL backend

Signed-off-by: Roelof Groenewald
---
 src/amrex/extensions/MultiFab.py | 80 ++++++++++++++++++++++++++------
 src/dlpack.h                     |  2 +-
 2 files changed, 68 insertions(+), 14 deletions(-)

diff --git a/src/amrex/extensions/MultiFab.py b/src/amrex/extensions/MultiFab.py
index 61b5d159..0fe7c214 100644
--- a/src/amrex/extensions/MultiFab.py
+++ b/src/amrex/extensions/MultiFab.py
@@ -103,10 +103,50 @@ def mf_to_cupy(self, copy=False, order="F"):
     return views
 
 
+def mf_to_dpnp(self, copy=False, order="F"):
+    """
+    Provide a dpnp view into a MultiFab.
+
+    This includes ngrow guard cells of each box.
+
+    Note on the order of indices:
+    By default, this is as in AMReX in Fortran contiguous order, indexing as
+    x,y,z. This has performance implications for use in external libraries such
+    as dpnp.
+    The order="C" option will index as z,y,x and may perform better.
+    https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074
+
+    Parameters
+    ----------
+    self : amrex.MultiFab
+        A MultiFab class in pyAMReX
+    copy : bool, optional
+        Copy the data if true, otherwise create a view (default).
+    order : string, optional
+        F order (default) or C. C is faster with external libraries.
+
+    Returns
+    -------
+    list of dpnp.array
+        A list of dpnp n-dimensional arrays, for each local block in the
+        MultiFab.
+
+    Raises
+    ------
+    ImportError
+        Raises an exception if dpnp is not installed
+    """
+    views = []
+    for mfi in self:
+        views.append(self.array(mfi).to_dpnp(copy, order))
+
+    return views
+
+
 def mf_to_xp(self, copy=False, order="F"):
     """
-    Provide a NumPy or CuPy view into a MultiFab,
-    depending on amr.Config.have_gpu .
+    Provide a NumPy, CuPy or dpnp view into a MultiFab,
+    depending on amr.Config.have_gpu and amr.Config.gpu_backend .
 
     This function is similar to CuPy's xp naming suggestion for CPU/GPU agnostic code:
     https://docs.cupy.dev/en/stable/user_guide/basic.html#how-to-write-cpu-gpu-agnostic-code
@@ -132,15 +172,21 @@ def mf_to_xp(self, copy=False, order="F"):
     Returns
     -------
     list of xp.array
-        A list of NumPy or CuPy n-dimensional arrays, for each local block in the
-        MultiFab.
+        A list of NumPy, CuPy or dpnp n-dimensional arrays, for each local block
+        in the MultiFab.
     """
     import inspect
 
     amr = inspect.getmodule(self)
-    return (
-        self.to_cupy(copy, order) if amr.Config.have_gpu else self.to_numpy(copy, order)
-    )
+
+    if amr.Config.have_gpu:
+        if amr.Config.gpu_backend == "SYCL":
+            return self.to_dpnp(copy, order)
+        else:  # if not SYCL use cupy
+            return self.to_cupy(copy, order)
+
+    # if no GPU, use NumPy
+    return self.to_numpy(copy, order)
 
 
 def copy_multifab(amr, self):
@@ -490,6 +536,9 @@ def __getitem__(self, index, with_internal_ghosts=False):
         Whether to include internal ghost cells. When true, data from ghost cells may be used that
         overlaps valid cells.
     """
+    import inspect
+    amr = inspect.getmodule(self)
+
     index4 = _process_index(self, index)
 
     # Gather the data to be included in a list to be sent to other processes
@@ -503,17 +552,18 @@ def __getitem__(self, index, with_internal_ghosts=False):
             device_arr = _get_field(self, mfi)
             slice_arr = device_arr[block_slices]
             try:
-                # Copy data from host to device using cupy syntax
-                slice_arr = slice_arr.get()
+                if amr.Config.gpu_backend == "SYCL":
+                    import dpnp
+                    slice_arr = dpnp.asnumpy(slice_arr)
+                else:
+                    # Copy data from device to host using cupy syntax
+                    slice_arr = slice_arr.get()
             except AttributeError:
                 # Array is already a numpy array on the host
                 pass
             datalist.append((global_slices, slice_arr))
 
     # Gather the data from all processors
-    import inspect
-
-    amr = inspect.getmodule(self)
     if amr.Config.have_mpi:
         npes = amr.ParallelDescriptor.NProcs()
     else:
@@ -604,7 +654,10 @@ def __setitem__(self, index, value):
 
     amr = inspect.getmodule(self)
     if amr.Config.have_gpu:
-        import cupy as xp
+        if amr.Config.gpu_backend == "SYCL":
+            import dpnp as xp
+        else:
+            import cupy as xp
     else:
         xp = np
 
@@ -654,6 +707,7 @@ def register_MultiFab_extension(amr):
 
     amr.MultiFab.to_numpy = mf_to_numpy
     amr.MultiFab.to_cupy = mf_to_cupy
+    amr.MultiFab.to_dpnp = mf_to_dpnp
     amr.MultiFab.to_xp = mf_to_xp
 
     amr.MultiFab.copy = lambda self: copy_multifab(amr, self)
diff --git a/src/dlpack.h b/src/dlpack.h
index d9054876..e25fb19f 100644
--- a/src/dlpack.h
+++ b/src/dlpack.h
@@ -197,7 +197,7 @@ typedef struct {
  * types. This pointer is always aligned to 256 bytes as in CUDA. The
  * `byte_offset` field should be used to point to the beginning of the data.
  *
- * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
+ * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
  * TVM, perhaps others) do not adhere to this 256 byte aligment requirement
  * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
  * (after which this note will be updated); at the moment it is recommended
  * to not rely on the data pointer being correctly aligned.

From 228f940e557483df30e8c6b88bb332503d4d0fb8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 26 Jul 2025 07:14:19 +0000
Subject: [PATCH 09/16] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/amrex/extensions/MultiFab.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/amrex/extensions/MultiFab.py b/src/amrex/extensions/MultiFab.py
index 0fe7c214..776cc5d3 100644
--- a/src/amrex/extensions/MultiFab.py
+++ b/src/amrex/extensions/MultiFab.py
@@ -537,6 +537,7 @@ def __getitem__(self, index, with_internal_ghosts=False):
         overlaps valid cells.
     """
     import inspect
+
     amr = inspect.getmodule(self)
 
     index4 = _process_index(self, index)
@@ -554,6 +555,7 @@ def __getitem__(self, index, with_internal_ghosts=False):
             try:
                 if amr.Config.gpu_backend == "SYCL":
                     import dpnp
+
                     slice_arr = dpnp.asnumpy(slice_arr)
                 else:
                     # Copy data from device to host using cupy syntax

From 13f68c9d8cb41b4ea379dbd5ee6118edd244c343 Mon Sep 17 00:00:00 2001
From: Roelof Groenewald
Date: Sat, 26 Jul 2025 09:33:00 -0700
Subject: [PATCH 10/16] use `array4_to_xp` in `MultiFab.py` to avoid needing
 `mf_to_cupy` and `mf_to_dpnp`

Signed-off-by: Roelof Groenewald
---
 src/amrex/extensions/MultiFab.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/amrex/extensions/MultiFab.py b/src/amrex/extensions/MultiFab.py
index 776cc5d3..9d0d1d90 100644
--- a/src/amrex/extensions/MultiFab.py
+++ b/src/amrex/extensions/MultiFab.py
@@ -175,18 +175,11 @@ def mf_to_xp(self, copy=False, order="F"):
         A list of NumPy, CuPy or dpnp n-dimensional arrays, for each local block
         in the MultiFab.
""" - import inspect - - amr = inspect.getmodule(self) - - if amr.Config.have_gpu: - if amr.Config.gpu_backend == "SYCL": - return self.to_dpnp(copy, order) - else: # if not SYCL use cupy - return self.to_cupy(copy, order) + views = [] + for mfi in self: + views.append(self.array(mfi).to_xp(copy, order)) - # if no GPU, use NumPy - return self.to_numpy(copy, order) + return views def copy_multifab(amr, self): From 13f68c9d8cb41b4ea379dbd5ee6118edd244c343 Mon Sep 17 00:00:00 2001 From: Roelof Groenewald Date: Sat, 26 Jul 2025 11:27:34 -0700 Subject: [PATCH 11/16] add specific optional arguments to `Array4.__dlpack__` Signed-off-by: Roelof Groenewald --- src/Base/Array4.H | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/Base/Array4.H b/src/Base/Array4.H index bf3979ca..91bc7f07 100644 --- a/src/Base/Array4.H +++ b/src/Base/Array4.H @@ -231,13 +231,12 @@ namespace pyAMReX // https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol .def("__dlpack__", []( Array4 const &a4, - py::kwargs kwargs - /* TODO: Handle keyword arguments - [[maybe_unused]] py::handle stream, - [[maybe_unused]] std::tuple max_version, - [[maybe_unused]] std::tuple dl_device, - [[maybe_unused]] bool copy - */ + /* TODO: Handle keyword arguments */ + [[maybe_unused]] std::optional stream = std::nullopt, + [[maybe_unused]] std::optional> max_version = std::nullopt, + [[maybe_unused]] std::optional> dl_device = std::nullopt, + [[maybe_unused]] std::optional copy = std::nullopt + ) { // Allocate shape/strides arrays @@ -281,8 +280,10 @@ namespace pyAMReX } ); }, - //py::arg("stream") = py::none(), - // ... other args & their defaults + py::arg("stream") = py::none(), + py::arg("max_version") = py::none(), + py::arg("dl_device") = py::none(), + py::arg("copy") = py::none(), R"doc( DLPack protocol for zero-copy tensor exchange. See https://dmlc.github.io/dlpack/latest/ for details. From 728fa4a986ba938c307b25ed28f8e394c2ac23bb Mon Sep 17 00:00:00 2001 From: Roelof Groenewald Date: Sun, 27 Jul 2025 17:46:19 -0700 Subject: [PATCH 12/16] remove `mf_to_cupy` and `mf_to_dpnp` Signed-off-by: Roelof Groenewald --- src/amrex/extensions/MultiFab.py | 83 -------------------------------- 1 file changed, 83 deletions(-) diff --git a/src/amrex/extensions/MultiFab.py b/src/amrex/extensions/MultiFab.py index 9d0d1d90..dea96244 100644 --- a/src/amrex/extensions/MultiFab.py +++ b/src/amrex/extensions/MultiFab.py @@ -63,86 +63,6 @@ def mf_to_numpy(self, copy=False, order="F"): return views -def mf_to_cupy(self, copy=False, order="F"): - """ - Provide a CuPy view into a MultiFab. - - This includes ngrow guard cells of each box. - - Note on the order of indices: - By default, this is as in AMReX in Fortran contiguous order, indexing as - x,y,z. This has performance implications for use in external libraries such - as cupy. - The order="C" option will index as z,y,x and perform better with cupy. - https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074 - - Parameters - ---------- - self : amrex.MultiFab - A MultiFab class in pyAMReX - copy : bool, optional - Copy the data if true, otherwise create a view (default). - order : string, optional - F order (default) or C. C is faster with external libraries. - - Returns - ------- - list of cupy.array - A list of CuPy n-dimensional arrays, for each local block in the - MultiFab. 
From 728fa4a986ba938c307b25ed28f8e394c2ac23bb Mon Sep 17 00:00:00 2001
From: Roelof Groenewald
Date: Sun, 27 Jul 2025 17:46:19 -0700
Subject: [PATCH 12/16] remove `mf_to_cupy` and `mf_to_dpnp`

Signed-off-by: Roelof Groenewald
---
 src/amrex/extensions/MultiFab.py | 83 --------------------------------
 1 file changed, 83 deletions(-)

diff --git a/src/amrex/extensions/MultiFab.py b/src/amrex/extensions/MultiFab.py
index 9d0d1d90..dea96244 100644
--- a/src/amrex/extensions/MultiFab.py
+++ b/src/amrex/extensions/MultiFab.py
@@ -63,86 +63,6 @@ def mf_to_numpy(self, copy=False, order="F"):
     return views
 
 
-def mf_to_cupy(self, copy=False, order="F"):
-    """
-    Provide a CuPy view into a MultiFab.
-
-    This includes ngrow guard cells of each box.
-
-    Note on the order of indices:
-    By default, this is as in AMReX in Fortran contiguous order, indexing as
-    x,y,z. This has performance implications for use in external libraries such
-    as cupy.
-    The order="C" option will index as z,y,x and perform better with cupy.
-    https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074
-
-    Parameters
-    ----------
-    self : amrex.MultiFab
-        A MultiFab class in pyAMReX
-    copy : bool, optional
-        Copy the data if true, otherwise create a view (default).
-    order : string, optional
-        F order (default) or C. C is faster with external libraries.
-
-    Returns
-    -------
-    list of cupy.array
-        A list of CuPy n-dimensional arrays, for each local block in the
-        MultiFab.
-
-    Raises
-    ------
-    ImportError
-        Raises an exception if cupy is not installed
-    """
-    views = []
-    for mfi in self:
-        views.append(self.array(mfi).to_cupy(copy, order))
-
-    return views
-
-
-def mf_to_dpnp(self, copy=False, order="F"):
-    """
-    Provide a dpnp view into a MultiFab.
-
-    This includes ngrow guard cells of each box.
-
-    Note on the order of indices:
-    By default, this is as in AMReX in Fortran contiguous order, indexing as
-    x,y,z. This has performance implications for use in external libraries such
-    as dpnp.
-    The order="C" option will index as z,y,x and may perform better.
-    https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074
-
-    Parameters
-    ----------
-    self : amrex.MultiFab
-        A MultiFab class in pyAMReX
-    copy : bool, optional
-        Copy the data if true, otherwise create a view (default).
-    order : string, optional
-        F order (default) or C. C is faster with external libraries.
-
-    Returns
-    -------
-    list of dpnp.array
-        A list of dpnp n-dimensional arrays, for each local block in the
-        MultiFab.
-
-    Raises
-    ------
-    ImportError
-        Raises an exception if dpnp is not installed
-    """
-    views = []
-    for mfi in self:
-        views.append(self.array(mfi).to_dpnp(copy, order))
-
-    return views
-
-
 def mf_to_xp(self, copy=False, order="F"):
     """
     Provide a NumPy, CuPy or dpnp view into a MultiFab,
     depending on amr.Config.have_gpu and amr.Config.gpu_backend .
@@ -621,8 +541,6 @@ def register_MultiFab_extension(amr):
     amr.MultiFab.__iter__ = lambda mfab: amr.MFIter(mfab)
 
     amr.MultiFab.to_numpy = mf_to_numpy
-    amr.MultiFab.to_cupy = mf_to_cupy
-    amr.MultiFab.to_dpnp = mf_to_dpnp
     amr.MultiFab.to_xp = mf_to_xp
 
     amr.MultiFab.copy = lambda self: copy_multifab(amr, self)
@@ -636,7 +554,6 @@ def register_MultiFab_extension(amr):
     amr.iMultiFab.__iter__ = lambda imfab: amr.MFIter(imfab)
 
     amr.iMultiFab.to_numpy = mf_to_numpy
-    amr.iMultiFab.to_cupy = mf_to_cupy
     amr.iMultiFab.to_xp = mf_to_xp
 
     amr.iMultiFab.copy = lambda self: copy_multifab(amr, self)
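After this cleanup the per-box backend dispatch lives in exactly one place, Array4.to_xp, and MultiFab.to_xp merely collects those views. Typical use is unchanged (a sketch, mf being any MultiFab):

    for block in mf.to_xp():     # one n-dimensional array per local box
        block *= 2.0             # in-place update through the view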
"dlpack.h" + + +void init_DLPack(py::module& m) +{ + using namespace amrex; + + // register types only if not already present, e.g., from another library + // that also implements DLPack bindings and exposes the types + + py::type pyDLDeviceType = py::type::of(); + if (!pyDLDeviceType) { + py::native_enum(m, "DLDeviceType", "enum.IntEnum") + .value("kDLCPU", DLDeviceType::kDLCPU) + .value("kDLCUDA", DLDeviceType::kDLCUDA) + .value("kDLCUDAHost", DLDeviceType::kDLCUDAHost) + .value("kDLOpenCL", DLDeviceType::kDLOpenCL) + .value("kDLVulkan", DLDeviceType::kDLVulkan) + .value("kDLMetal", DLDeviceType::kDLMetal) + .value("kDLVPI", DLDeviceType::kDLVPI) + .value("kDLROCM", DLDeviceType::kDLROCM) + .value("kDLROCMHost", DLDeviceType::kDLROCMHost) + .value("kDLExtDev", DLDeviceType::kDLExtDev) + .value("kDLCUDAManaged", DLDeviceType::kDLCUDAManaged) + .value("kDLOneAPI", DLDeviceType::kDLOneAPI) + .value("kDLWebGPU", DLDeviceType::kDLWebGPU) + .value("kDLHexagon", DLDeviceType::kDLHexagon) + .value("kDLMAIA", DLDeviceType::kDLMAIA) + ; + } + +} diff --git a/src/dlpack.h b/src/dlpack/dlpack.h similarity index 100% rename from src/dlpack.h rename to src/dlpack/dlpack.h diff --git a/src/pyAMReX.cpp b/src/pyAMReX.cpp index 36ce03d0..ab9ccfa9 100644 --- a/src/pyAMReX.cpp +++ b/src/pyAMReX.cpp @@ -20,6 +20,7 @@ void init_Arena(py::module&); void init_Array4(py::module&); void init_BaseFab(py::module&); void init_Box(py::module &); +void init_DLPack(py::module &); void init_RealBox(py::module &); void init_BoxArray(py::module &); void init_CoordSys(py::module&); @@ -98,6 +99,7 @@ PYBIND11_MODULE(amrex_3d_pybind, m) { // note: order from parent to child classes and argument usage + init_DLPack(m); init_AMReX(m); init_Arena(m); init_Dim3(m); From 63a007168b0d9c5b435c68402e82a6b6396454e4 Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Mon, 28 Jul 2025 14:47:39 -0700 Subject: [PATCH 14/16] Simplify Signed-off-by: Axel Huebl --- src/dlpack/DLPack.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dlpack/DLPack.cpp b/src/dlpack/DLPack.cpp index eaad9f3c..493ec1e0 100644 --- a/src/dlpack/DLPack.cpp +++ b/src/dlpack/DLPack.cpp @@ -10,7 +10,7 @@ void init_DLPack(py::module& m) // register types only if not already present, e.g., from another library // that also implements DLPack bindings and exposes the types - py::type pyDLDeviceType = py::type::of(); + py::type pyDLDeviceType = false; // TODO: py::type::of(); if (!pyDLDeviceType) { py::native_enum(m, "DLDeviceType", "enum.IntEnum") .value("kDLCPU", DLDeviceType::kDLCPU) From ac9e5bc0c5a024522f0aa20d247be272a909436a Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Mon, 28 Jul 2025 14:48:33 -0700 Subject: [PATCH 15/16] Simplify Signed-off-by: Axel Huebl --- src/dlpack/DLPack.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/dlpack/DLPack.cpp b/src/dlpack/DLPack.cpp index 493ec1e0..29d806a9 100644 --- a/src/dlpack/DLPack.cpp +++ b/src/dlpack/DLPack.cpp @@ -10,7 +10,8 @@ void init_DLPack(py::module& m) // register types only if not already present, e.g., from another library // that also implements DLPack bindings and exposes the types - py::type pyDLDeviceType = false; // TODO: py::type::of(); + // TODO: py::type pyDLDeviceType = py::type::of(); + bool pyDLDeviceType = false; if (!pyDLDeviceType) { py::native_enum(m, "DLDeviceType", "enum.IntEnum") .value("kDLCPU", DLDeviceType::kDLCPU) From b6e03919d5fe29d593d0a977a4f0f977702a4548 Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Fri, 7 Nov 2025 14:48:11 -0800 
From b6e03919d5fe29d593d0a977a4f0f977702a4548 Mon Sep 17 00:00:00 2001
From: Axel Huebl
Date: Fri, 7 Nov 2025 14:48:11 -0800
Subject: [PATCH 16/16] Fix enum: export/finalize

Signed-off-by: Axel Huebl
---
 src/dlpack/DLPack.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/dlpack/DLPack.cpp b/src/dlpack/DLPack.cpp
index 29d806a9..72a1a8e2 100644
--- a/src/dlpack/DLPack.cpp
+++ b/src/dlpack/DLPack.cpp
@@ -29,6 +29,8 @@ void init_DLPack(py::module& m)
             .value("kDLWebGPU", DLDeviceType::kDLWebGPU)
             .value("kDLHexagon", DLDeviceType::kDLHexagon)
             .value("kDLMAIA", DLDeviceType::kDLMAIA)
+            .export_values()
+            .finalize()
         ;
     }
 