From adb52c8417f543556394e714bc5a5ca34f394ac5 Mon Sep 17 00:00:00 2001
From: Axel Huebl
Date: Tue, 22 Jul 2025 17:48:57 -0700
Subject: [PATCH 01/16] Vibe Start

---
 src/Base/Array4.H | 111 ++++++++++++++++++++++++++++++++++++++++++++--
 src/Base/dlpack.h |  66 +++++++++++++++++++++++++++
 2 files changed, 174 insertions(+), 3 deletions(-)
 create mode 100644 src/Base/dlpack.h

diff --git a/src/Base/Array4.H b/src/Base/Array4.H
index a0ede4cb..96cb84f9 100644
--- a/src/Base/Array4.H
+++ b/src/Base/Array4.H
@@ -18,6 +18,15 @@
 #include <sstream>
 #include <type_traits>
 #include <vector>
+#include "dlpack.h"
+
+// GPU backend headers for device detection
+#ifdef AMREX_USE_CUDA
+#include <cuda_runtime.h>
+#endif
+#ifdef AMREX_USE_HIP
+#include <hip/hip_runtime.h>
+#endif
 
 
 namespace
@@ -222,13 +231,109 @@ namespace pyAMReX
         })
 
-        // TODO: __dlpack__ __dlpack_device__
         // DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
         // https://dmlc.github.io/dlpack/latest/
-        // https://data-apis.org/array-api/latest/design_topics/data_interchange.html
-        // https://github.com/data-apis/consortium-feedback/issues/1
         // https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
         // https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
+        .def("__dlpack__", [](Array4<T> const &a4, py::handle stream = py::none()) {
+            // Allocate shape/strides arrays
+            constexpr int ndim = 4;
+            auto const len = length(a4);
+            auto *shape = new int64_t[ndim]{a4.nComp(), len.z, len.y, len.x};
+            auto *strides = new int64_t[ndim]{a4.nstride, a4.kstride, a4.jstride, 1};
+            // DLPack dtype
+            DLDataType dtype{};
+            if constexpr (std::is_same_v<T, float>) { dtype.code = kDLFloat; dtype.bits = 32; dtype.lanes = 1; }
+            else if constexpr (std::is_same_v<T, double>) { dtype.code = kDLFloat; dtype.bits = 64; dtype.lanes = 1; }
+            else if constexpr (std::is_same_v<T, std::int32_t>) { dtype.code = kDLInt; dtype.bits = 32; dtype.lanes = 1; }
+            else if constexpr (std::is_same_v<T, std::int64_t>) { dtype.code = kDLInt; dtype.bits = 64; dtype.lanes = 1; }
+            else if constexpr (std::is_same_v<T, std::uint32_t>) { dtype.code = kDLUInt; dtype.bits = 32; dtype.lanes = 1; }
+            else if constexpr (std::is_same_v<T, std::uint64_t>) { dtype.code = kDLUInt; dtype.bits = 64; dtype.lanes = 1; }
+            else { throw std::runtime_error("Unsupported dtype for DLPack"); }
+
+            // Device detection based on AMReX GPU backend
+            DLDevice device{ kDLCPU, 0 };
+#ifdef AMREX_USE_CUDA
+            // Check if data is on GPU by checking if pointer is in CUDA memory
+            cudaPointerAttributes attr;
+            cudaError_t err = cudaPointerGetAttributes(&attr, a4.dataPtr());
+            if (err == cudaSuccess && attr.memoryType == cudaMemoryTypeDevice) {
+                device.device_type = kDLCUDA;
+                device.device_id = attr.device;
+            }
+#elif defined(AMREX_USE_HIP)
+            // Check if data is on GPU by checking if pointer is in HIP memory
+            hipPointerAttribute_t attr;
+            hipError_t err = hipPointerGetAttributes(&attr, a4.dataPtr());
+            if (err == hipSuccess && attr.memoryType == hipMemoryTypeDevice) {
+                device.device_type = kDLROCM;
+                device.device_id = attr.device;
+            }
+#elif defined(AMREX_USE_DPCPP)
+            // For SYCL, we need to check if the data is on device
+            // This is more complex as SYCL doesn't have a simple pointer check
+            // For now, assume CPU - SYCL support would need more sophisticated detection
+            // device.device_type = kDLExtDev;  // SYCL would use extended device type
+            // device.device_id = 0;
+#endif
+
+            // Construct DLTensor
+            auto *dl_tensor = new DLManagedTensor;
+            dl_tensor->dl_tensor.data = const_cast<void*>(static_cast<void const*>(a4.dataPtr()));
+            dl_tensor->dl_tensor.device = device;
+            dl_tensor->dl_tensor.ndim = ndim;
+            dl_tensor->dl_tensor.dtype = dtype;
+            dl_tensor->dl_tensor.shape = shape;
+            dl_tensor->dl_tensor.strides = strides;
+            dl_tensor->dl_tensor.byte_offset = 0;
+            dl_tensor->manager_ctx = nullptr;
+            dl_tensor->deleter = [](DLManagedTensor *self) {
+                delete[] self->dl_tensor.shape;
+                delete[] self->dl_tensor.strides;
+                delete self;
+            };
+            // Return as Python capsule
+            return py::capsule(dl_tensor, "dltensor", [](void* ptr) {
+                auto* tensor = static_cast<DLManagedTensor*>(ptr);
+                tensor->deleter(tensor);
+            });
+        }, py::arg("stream") = py::none(), R"doc(
+        DLPack protocol for zero-copy tensor exchange.
+        See https://dmlc.github.io/dlpack/latest/ for details.
+        )doc")
+        .def("__dlpack_device__", [](Array4<T> const &a4) {
+            // Device detection based on AMReX GPU backend
+            int device_type = kDLCPU;
+            int device_id = 0;
+
+#ifdef AMREX_USE_CUDA
+            // Check if data is on GPU by checking if pointer is in CUDA memory
+            cudaPointerAttributes attr;
+            cudaError_t err = cudaPointerGetAttributes(&attr, a4.dataPtr());
+            if (err == cudaSuccess && attr.memoryType == cudaMemoryTypeDevice) {
+                device_type = kDLCUDA;
+                device_id = attr.device;
+            }
+#elif defined(AMREX_USE_HIP)
+            // Check if data is on GPU by checking if pointer is in HIP memory
+            hipPointerAttribute_t attr;
+            hipError_t err = hipPointerGetAttributes(&attr, a4.dataPtr());
+            if (err == hipSuccess && attr.memoryType == hipMemoryTypeDevice) {
+                device_type = kDLROCM;
+                device_id = attr.device;
+            }
+#elif defined(AMREX_USE_DPCPP)
+            // For SYCL, we need to check if the data is on device
+            // This is more complex as SYCL doesn't have a simple pointer check
+            // For now, assume CPU - SYCL support would need more sophisticated detection
+            // device_type = kDLExtDev;  // SYCL would use extended device type
+            // device_id = 0;
+#endif
+
+            return std::make_tuple(device_type, device_id);
+        }, R"doc(
+        DLPack device info (device_type, device_id).
+        )doc")
 
         .def("to_host", [](Array4<T> const & a4) {
             // py::tuple to std::vector
             auto const a4i = pyAMReX::array_interface(a4);
diff --git a/src/Base/dlpack.h b/src/Base/dlpack.h
new file mode 100644
index 00000000..812b7640
--- /dev/null
+++ b/src/Base/dlpack.h
@@ -0,0 +1,66 @@
+#ifndef AMREX_DLPACK_H_
+#define AMREX_DLPACK_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+
+// Device type codes
+#define kDLCPU 1
+#define kDLCUDA 2
+#define kDLCUDAHost 3
+#define kDLOpenCL 4
+#define kDLVulkan 7
+#define kDLMetal 8
+#define kDLVPI 9
+#define kDLROCM 10
+#define kDLROCMHost 11
+#define kDLExtDev 12
+
+// Data type codes
+#define kDLInt 0
+#define kDLUInt 1
+#define kDLFloat 2
+
+// Device context
+typedef struct {
+    int32_t device_type;
+    int32_t device_id;
+} DLDevice;
+
+// Data type
+typedef struct {
+    uint8_t code;    // kDLFloat=2, kDLInt=0, kDLUInt=1
+    uint8_t bits;    // number of bits, e.g., 32, 64
+    uint16_t lanes;  // number of lanes (for vector types)
+} DLDataType;
+
+// Tensor structure
+typedef struct {
+    void* data;
+    DLDevice device;
+    int32_t ndim;
+    int64_t* shape;
+    int64_t* strides;  // in elements, not bytes; can be NULL for compact
+    uint64_t byte_offset;
+    DLDataType dtype;
+} DLTensor;
+
+// Managed tensor with deleter
+struct DLManagedTensor;
+typedef void (*DLManagedTensorDeleter)(struct DLManagedTensor* self);
+
+typedef struct DLManagedTensor {
+    DLTensor dl_tensor;
+    void* manager_ctx;
+    DLManagedTensorDeleter deleter;
+} DLManagedTensor;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AMREX_DLPACK_H_
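On the consumer side this protocol is exercised with a single call in NumPy or CuPy. A minimal sketch of the intended round trip (assuming a host-side amr.Array4_double, following the existing test suite in tests/test_array4.py):

    import numpy as np
    import amrex.space3d as amr

    x = np.ones((2, 3, 4))           # z, y, x
    arr = amr.Array4_double(x)
    print(arr.__dlpack_device__())   # e.g. (1, 0): kDLCPU, device 0
    view = np.from_dlpack(arr)       # zero-copy view via __dlpack__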
From 32e4535b5f16ff0e73deace5f68d635bdb46b86a Mon Sep 17 00:00:00 2001
From: Axel Huebl
Date: Tue, 22 Jul 2025 20:02:05 -0700
Subject: [PATCH 02/16] Cleaning

---
 src/Base/Array4.H |  91 ++-------
 src/Base/dlpack.h |  66 -------
 src/dlpack.h      | 466 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 479 insertions(+), 144 deletions(-)
 delete mode 100644 src/Base/dlpack.h
 create mode 100644 src/dlpack.h

diff --git a/src/Base/Array4.H b/src/Base/Array4.H
index 96cb84f9..a35f1c98 100644
--- a/src/Base/Array4.H
+++ b/src/Base/Array4.H
@@ -6,6 +6,7 @@
 #pragma once
 
 #include "pyAMReX.H"
+#include "dlpack.h"
 
 #include <AMReX_Array4.H>
 #include <AMReX_BLassert.H>
@@ -18,15 +19,6 @@
 #include <sstream>
 #include <type_traits>
 #include <vector>
-#include "dlpack.h"
-
-// GPU backend headers for device detection
-#ifdef AMREX_USE_CUDA
-#include <cuda_runtime.h>
-#endif
-#ifdef AMREX_USE_HIP
-#include <hip/hip_runtime.h>
-#endif
 
 
 namespace
@@ -194,6 +186,7 @@ namespace pyAMReX
          */
 
 
+        /*
         // CPU: __array_interface__ v3
         // https://numpy.org/doc/stable/reference/arrays.interface.html
         .def_property_readonly("__array_interface__", [](Array4<T> const & a4) {
@@ -229,60 +222,26 @@ namespace pyAMReX
             d["version"] = 3;
             return d;
         })
+        */
 
         // DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
         // https://dmlc.github.io/dlpack/latest/
         // https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
         // https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
-        .def("__dlpack__", [](Array4<T> const &a4, py::handle stream = py::none()) {
+        .def("__dlpack__", [](Array4<T> const &a4, [[maybe_unused]] py::handle stream = py::none()) {
             // Allocate shape/strides arrays
             constexpr int ndim = 4;
             auto const len = length(a4);
             auto *shape = new int64_t[ndim]{a4.nComp(), len.z, len.y, len.x};
             auto *strides = new int64_t[ndim]{a4.nstride, a4.kstride, a4.jstride, 1};
-            // DLPack dtype
-            DLDataType dtype{};
-            if constexpr (std::is_same_v<T, float>) { dtype.code = kDLFloat; dtype.bits = 32; dtype.lanes = 1; }
-            else if constexpr (std::is_same_v<T, double>) { dtype.code = kDLFloat; dtype.bits = 64; dtype.lanes = 1; }
-            else if constexpr (std::is_same_v<T, std::int32_t>) { dtype.code = kDLInt; dtype.bits = 32; dtype.lanes = 1; }
-            else if constexpr (std::is_same_v<T, std::int64_t>) { dtype.code = kDLInt; dtype.bits = 64; dtype.lanes = 1; }
-            else if constexpr (std::is_same_v<T, std::uint32_t>) { dtype.code = kDLUInt; dtype.bits = 32; dtype.lanes = 1; }
-            else if constexpr (std::is_same_v<T, std::uint64_t>) { dtype.code = kDLUInt; dtype.bits = 64; dtype.lanes = 1; }
-            else { throw std::runtime_error("Unsupported dtype for DLPack"); }
-
-            // Device detection based on AMReX GPU backend
-            DLDevice device{ kDLCPU, 0 };
-#ifdef AMREX_USE_CUDA
-            // Check if data is on GPU by checking if pointer is in CUDA memory
-            cudaPointerAttributes attr;
-            cudaError_t err = cudaPointerGetAttributes(&attr, a4.dataPtr());
-            if (err == cudaSuccess && attr.memoryType == cudaMemoryTypeDevice) {
-                device.device_type = kDLCUDA;
-                device.device_id = attr.device;
-            }
-#elif defined(AMREX_USE_HIP)
-            // Check if data is on GPU by checking if pointer is in HIP memory
-            hipPointerAttribute_t attr;
-            hipError_t err = hipPointerGetAttributes(&attr, a4.dataPtr());
-            if (err == hipSuccess && attr.memoryType == hipMemoryTypeDevice) {
-                device.device_type = kDLROCM;
-                device.device_id = attr.device;
-            }
-#elif defined(AMREX_USE_DPCPP)
-            // For SYCL, we need to check if the data is on device
-            // This is more complex as SYCL doesn't have a simple pointer check
-            // For now, assume CPU - SYCL support would need more sophisticated detection
-            // device.device_type = kDLExtDev;  // SYCL would use extended device type
-            // device.device_id = 0;
-#endif
 
             // Construct DLTensor
             auto *dl_tensor = new DLManagedTensor;
             dl_tensor->dl_tensor.data = const_cast<void*>(static_cast<void const*>(a4.dataPtr()));
-            dl_tensor->dl_tensor.device = device;
+            dl_tensor->dl_tensor.device = dlpack::detect_device_from_pointer(a4.dataPtr());
             dl_tensor->dl_tensor.ndim = ndim;
-            dl_tensor->dl_tensor.dtype = dtype;
+            dl_tensor->dl_tensor.dtype = dlpack::get_dlpack_dtype<T>();
             dl_tensor->dl_tensor.shape = shape;
             dl_tensor->dl_tensor.strides = strides;
             dl_tensor->dl_tensor.byte_offset = 0;
@@ -297,40 +256,16 @@ namespace pyAMReX
             auto* tensor = static_cast<DLManagedTensor*>(ptr);
             tensor->deleter(tensor);
             });
-        }, py::arg("stream") = py::none(), R"doc(
+        },
+        py::arg("stream") = py::none(),
+        R"doc(
         DLPack protocol for zero-copy tensor exchange.
         See https://dmlc.github.io/dlpack/latest/ for details.
-        )doc")
+        )doc"
+        )
         .def("__dlpack_device__", [](Array4<T> const &a4) {
-            // Device detection based on AMReX GPU backend
-            int device_type = kDLCPU;
-            int device_id = 0;
-
-#ifdef AMREX_USE_CUDA
-            // Check if data is on GPU by checking if pointer is in CUDA memory
-            cudaPointerAttributes attr;
-            cudaError_t err = cudaPointerGetAttributes(&attr, a4.dataPtr());
-            if (err == cudaSuccess && attr.memoryType == cudaMemoryTypeDevice) {
-                device_type = kDLCUDA;
-                device_id = attr.device;
-            }
-#elif defined(AMREX_USE_HIP)
-            // Check if data is on GPU by checking if pointer is in HIP memory
-            hipPointerAttribute_t attr;
-            hipError_t err = hipPointerGetAttributes(&attr, a4.dataPtr());
-            if (err == hipSuccess && attr.memoryType == hipMemoryTypeDevice) {
-                device_type = kDLROCM;
-                device_id = attr.device;
-            }
-#elif defined(AMREX_USE_DPCPP)
-            // For SYCL, we need to check if the data is on device
-            // This is more complex as SYCL doesn't have a simple pointer check
-            // For now, assume CPU - SYCL support would need more sophisticated detection
-            // device_type = kDLExtDev;  // SYCL would use extended device type
-            // device_id = 0;
-#endif
-
-            return std::make_tuple(device_type, device_id);
+            DLDevice device = dlpack::detect_device_from_pointer(a4.dataPtr());
+            return std::make_tuple(device.device_type, device.device_id);
         }, R"doc(
         DLPack device info (device_type, device_id).
         )doc")
diff --git a/src/Base/dlpack.h b/src/Base/dlpack.h
deleted file mode 100644
index 812b7640..00000000
--- a/src/Base/dlpack.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef AMREX_DLPACK_H_
-#define AMREX_DLPACK_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stdint.h>
-#include <stddef.h>
-
-// Device type codes
-#define kDLCPU 1
-#define kDLCUDA 2
-#define kDLCUDAHost 3
-#define kDLOpenCL 4
-#define kDLVulkan 7
-#define kDLMetal 8
-#define kDLVPI 9
-#define kDLROCM 10
-#define kDLROCMHost 11
-#define kDLExtDev 12
-
-// Data type codes
-#define kDLInt 0
-#define kDLUInt 1
-#define kDLFloat 2
-
-// Device context
-typedef struct {
-    int32_t device_type;
-    int32_t device_id;
-} DLDevice;
-
-// Data type
-typedef struct {
-    uint8_t code;    // kDLFloat=2, kDLInt=0, kDLUInt=1
-    uint8_t bits;    // number of bits, e.g., 32, 64
-    uint16_t lanes;  // number of lanes (for vector types)
-} DLDataType;
-
-// Tensor structure
-typedef struct {
-    void* data;
-    DLDevice device;
-    int32_t ndim;
-    int64_t* shape;
-    int64_t* strides;  // in elements, not bytes; can be NULL for compact
-    uint64_t byte_offset;
-    DLDataType dtype;
-} DLTensor;
-
-// Managed tensor with deleter
-struct DLManagedTensor;
-typedef void (*DLManagedTensorDeleter)(struct DLManagedTensor* self);
-
-typedef struct DLManagedTensor {
-    DLTensor dl_tensor;
-    void* manager_ctx;
-    DLManagedTensorDeleter deleter;
-} DLManagedTensor;
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AMREX_DLPACK_H_
diff --git a/src/dlpack.h b/src/dlpack.h
new file mode 100644
index 00000000..f962e270
--- /dev/null
+++ b/src/dlpack.h
@@ -0,0 +1,466 @@
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ *
+ * Source: https://github.com/dmlc/dlpack/blob/v1.1/include/dlpack/dlpack.h
+ */
+#ifndef AMREX_DLPACK_H_
+#define AMREX_DLPACK_H_
+
+#include <AMReX_Extension.H>
+#include <AMReX_GpuDevice.H>
+
+#include <stdexcept>
+#include <type_traits>
+
+extern "C" {
+
+#include <stdint.h>
+#include <stddef.h>
+
+/*!
+ * \brief The DLPack version.
+ *
+ * A change in major version indicates that we have changed the
+ * data layout of the ABI - DLManagedTensorVersioned.
+ * + * A change in minor version indicates that we have added new + * code, such as a new device type, but the ABI is kept the same. + * + * If an obtained DLPack tensor has a major version that disagrees + * with the version number specified in this header file + * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter + * (and it is safe to do so). It is not safe to access any other fields + * as the memory layout will have changed. + * + * In the case of a minor version mismatch, the tensor can be safely used as + * long as the consumer knows how to interpret all fields. Minor version + * updates indicate the addition of enumeration values. + */ +typedef struct { + /*! \brief DLPack major version. */ + uint32_t major; + /*! \brief DLPack minor version. */ + uint32_t minor; +} DLPackVersion; + +/*! + * \brief The device type in DLDevice. + */ +typedef enum : int32_t { + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! + * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! + * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, + /*! \brief Microsoft MAIA devices */ + kDLMAIA = 17, +} DLDeviceType; + +/*! + * \brief A Device for Tensor and operator. + */ +typedef struct { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. + */ + int32_t device_id; +} DLDevice; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + /*! \brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to be well-defined. + */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! + * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, + /*! \brief FP8 data types */ + kDLFloat8_e3m4 = 7U, + kDLFloat8_e4m3 = 8U, + kDLFloat8_e4m3b11fnuz = 9U, + kDLFloat8_e4m3fn = 10U, + kDLFloat8_e4m3fnuz = 11U, + kDLFloat8_e5m2 = 12U, + kDLFloat8_e5m2fnuz = 13U, + kDLFloat8_e8m0fnu = 14U, + /*! \brief FP6 data types + * Setting bits != 6 is currently unspecified, and the producer must ensure it is set + * while the consumer must stop importing if the value is unexpected. 
+ */ + kDLFloat6_e2m3fn = 15U, + kDLFloat6_e3m2fn = 16U, + /*! \brief FP4 data types + * Setting bits != 4 is currently unspecified, and the producer must ensure it is set + * while the consumer must stop importing if the value is unexpected. + */ + kDLFloat4_e2m1fn = 17U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. The data type is assumed to follow the + * native endian-ness. An explicit error message should be raised when attempting to + * export an array with non-native endianness + * + * Examples + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 + * - std::complex: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) + * - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory) + * - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory) + * - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory) + * + * When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e., + * for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element. + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The data pointer points to the allocated data. This will be CUDA + * device pointer or cl_mem handle in OpenCL. It may be opaque on some device + * types. This pointer is always aligned to 256 bytes as in CUDA. The + * `byte_offset` field should be used to point to the beginning of the data. + * + * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, + * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed + * (after which this note will be updated); at the moment it is recommended + * to not rely on the data pointer being correctly aligned. + * + * For given DLTensor, the size of memory required to store the contents of + * data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + * + * Note that if the tensor is of size zero, then the data pointer should be + * set to `NULL`. + */ + void* data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int32_t ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + int64_t* shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes) + * can be NULL, indicating tensor is compact and row-majored. + */ + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. 
+ * intended to facilitate the borrowing of DLTensor by another framework. It is
+ * not meant to transfer the tensor. When the borrowing framework doesn't need
+ * the tensor, it should call the deleter to notify the host that the resource
+ * is no longer needed.
+ *
+ * \note This data structure is used as Legacy DLManagedTensor
+ *       in DLPack exchange and is deprecated after DLPack v0.8
+ *       Use DLManagedTensorVersioned instead.
+ *       This data structure may get renamed or deleted in future versions.
+ *
+ * \sa DLManagedTensorVersioned
+ */
+typedef struct DLManagedTensor {
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+  /*! \brief the context of the original host framework of DLManagedTensor in
+   *   which DLManagedTensor is used in the framework. It can also be NULL.
+   */
+  void * manager_ctx;
+  /*!
+   * \brief Destructor - this should be called
+   * to destruct the manager_ctx which backs the DLManagedTensor. It can be
+   * NULL if there is no way for the caller to provide a reasonable destructor.
+   * The destructor deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensor * self);
+} DLManagedTensor;
+
+// bit masks used in in the DLManagedTensorVersioned
+
+/*! \brief bit mask to indicate that the tensor is read only. */
+#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
+
+/*!
+ * \brief bit mask to indicate that the tensor is a copy made by the producer.
+ *
+ * If set, the tensor is considered solely owned throughout its lifetime by the
+ * consumer, until the producer-provided deleter is invoked.
+ */
+#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)
+
+/*
+ * \brief bit mask to indicate that whether a sub-byte type is packed or padded.
+ *
+ * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
+ * be set by the producer to signal that a tensor of sub-byte type is padded.
+ */
+#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
+
+/*!
+ * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor by
+ * another framework. It is not meant to transfer the tensor. When the borrowing
+ * framework doesn't need the tensor, it should call the deleter to notify the
+ * host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+struct DLManagedTensorVersioned {
+  /*!
+   * \brief The API and ABI version of the current managed Tensor
+   */
+  DLPackVersion version;
+  /*!
+   * \brief the context of the original host framework.
+   *
+   * Stores DLManagedTensorVersioned is used in the
+   * framework. It can also be NULL.
+   */
+  void *manager_ctx;
+  /*!
+   * \brief Destructor.
+   *
+   * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned.
+   * It can be NULL if there is no way for the caller to provide a reasonable
+   * destructor. The destructor deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensorVersioned *self);
+  /*!
+   * \brief Additional bitmask flags information about the tensor.
+   *
+   * By default the flags should be set to 0.
+   *
+   * \note Future ABI changes should keep everything until this field
+   *       stable, to ensure that deleter can be correctly called.
+   *
+   * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+   * \sa DLPACK_FLAG_BITMASK_IS_COPIED
+   */
+  uint64_t flags;
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+};
+
+}  // extern "C"
+
+namespace pyAMReX::dlpack
+{
+
+    template <typename T>
+    AMREX_INLINE
+    DLDataType get_dlpack_dtype ()
+    {
+        DLDataType dtype{};
+
+        if constexpr (std::is_same_v<T, float>) {
+            dtype.code = kDLFloat;
+            dtype.bits = 32;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<T, double>) {
+            dtype.code = kDLFloat;
+            dtype.bits = 64;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<T, std::int32_t>) {
+            dtype.code = kDLInt;
+            dtype.bits = 32;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<T, std::int64_t>) {
+            dtype.code = kDLInt;
+            dtype.bits = 64;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<T, std::uint32_t>) {
+            dtype.code = kDLUInt;
+            dtype.bits = 32;
+            dtype.lanes = 1;
+        }
+        else if constexpr (std::is_same_v<T, std::uint64_t>) {
+            dtype.code = kDLUInt;
+            dtype.bits = 64;
+            dtype.lanes = 1;
+        }
+        else {
+            throw std::runtime_error("Unsupported dtype for DLPack");
+        }
+
+        return dtype;
+    }
+
+    AMREX_INLINE
+    DLDevice detect_device_from_pointer ([[maybe_unused]] const void* ptr)
+    {
+        DLDevice device{ kDLCPU, 0 };
+
+#ifdef AMREX_USE_CUDA
+        // Check if data is on GPU by checking if pointer is in CUDA memory
+        // note: cudaPointerGetAttributes is quite expensive, remove and
+        //       assume device-side if need be.
+        cudaPointerAttributes attr;
+        cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
+        if (err == cudaSuccess && attr.memoryType == cudaMemoryTypeDevice) {
+            device.device_type = kDLCUDA;
+            device.device_id = attr.device;
+        }
+#elif defined(AMREX_USE_HIP)
+        // Check if data is on GPU by checking if pointer is in HIP memory
+        // note: hipPointerGetAttributes is quite expensive, remove and
+        //       assume device-side if need be.
+        hipPointerAttribute_t attr;
+        hipError_t err = hipPointerGetAttributes(&attr, ptr);
+        if (err == hipSuccess && attr.memoryType == hipMemoryTypeDevice) {
+            device.device_type = kDLROCM;
+            device.device_id = attr.device;
+        }
+
+#elif defined(AMREX_USE_DPCPP)
+        // try {
+            // Get the SYCL context and queue from AMReX
+            auto const& queue = amrex::Gpu::Device::streamQueue();
+            auto const& context = queue.get_context();
+
+            // Try to get pointer attributes using SYCL USM queries
+            auto usm_type = sycl::get_pointer_type(ptr, context);
+
+            if (usm_type == sycl::usm::alloc::device ||
+                usm_type == sycl::usm::alloc::shared) {
+                device.device_type = kDLOneAPI;
+
+                // Try to get the actual device from the pointer
+                try {
+                    auto device_ptr = sycl::get_pointer_device(ptr, context);
+                    device.device_id = 0;  // Default to first device
+
+                    auto devices = context.get_devices();
+                    for (size_t i = 0; i < devices.size(); ++i) {
+                        if (devices[i] == device_ptr) {
+                            device.device_id = static_cast<int32_t>(i);
+                            break;
+                        }
+                    }
+                } catch (const sycl::exception&) {
+                    // If we can't determine the specific device, default to 0
+                    device.device_id = 0;
+                }
+            } else if (usm_type == sycl::usm::alloc::host) {
+                // Host USM allocation - still oneAPI but accessible from host
+                device.device_type = kDLOneAPI;
+                device.device_id = 0;
+            }
+            // If usm_type is sycl::usm::alloc::unknown, it might be regular CPU memory
+            // In that case, we keep the default CPU device type set above
+
+        /*
+        }
+        catch (const sycl::exception&) {
+            // If SYCL queries fail, assume it's regular CPU memory
+            // device remains as kDLCPU, 0 (set at function start)
+        } catch (...) {
+            // Handle any other exceptions gracefully
+            // device remains as kDLCPU, 0 (set at function start)
+        }
+        */
+#endif
+
+        return device;
+    }
+
+}  // namespace pyAMReX::dlpack
+
+#endif  // AMREX_DLPACK_H_
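Device detection now lives in one helper, pyAMReX::dlpack::detect_device_from_pointer, so __dlpack__ and __dlpack_device__ always report the same device. On a CUDA build the expected hand-off to CuPy looks roughly like this (a sketch, assuming device-allocated data and an installed CuPy):

    import cupy as cp

    device_type, device_id = arr.__dlpack_device__()   # (2, 0): kDLCUDA on GPU 0
    d_view = cp.from_dlpack(arr)                       # zero-copy device-side view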
From 0349655b0194bdd397ea98b66edfa135eac1de04 Mon Sep 17 00:00:00 2001
From: Axel Huebl
Date: Tue, 22 Jul 2025 22:38:07 -0700
Subject: [PATCH 03/16] More Cleanup

DLPack 1.1, e.g., in NumPy 2.1+

Tests do not yet pass.
---
 src/Base/Array4.H    | 67 +++++++++++++++++++++++++++++---------------
 src/dlpack.h         | 49 +++++++-------------------------
 tests/test_array4.py | 14 +++++----
 3 files changed, 63 insertions(+), 67 deletions(-)

diff --git a/src/Base/Array4.H b/src/Base/Array4.H
index a35f1c98..c6285eb4 100644
--- a/src/Base/Array4.H
+++ b/src/Base/Array4.H
@@ -225,39 +225,60 @@ namespace pyAMReX
          */
 
 
-        // DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
+        // DLPack v1.1 protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
         // https://dmlc.github.io/dlpack/latest/
         // https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
         // https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
-        .def("__dlpack__", [](Array4<T> const &a4, [[maybe_unused]] py::handle stream = py::none()) {
+        .def("__dlpack__", [](
+            Array4<T> const &a4
+            /* TODO:
+            [[maybe_unused]] py::handle stream,
+            [[maybe_unused]] std::tuple<int, int> max_version,
+            [[maybe_unused]] std::tuple<int, int> dl_device,
+            [[maybe_unused]] bool copy
+            */
+        )
+        {
             // Allocate shape/strides arrays
             constexpr int ndim = 4;
             auto const len = length(a4);
-            auto *shape = new int64_t[ndim]{a4.nComp(), len.z, len.y, len.x};
-            auto *strides = new int64_t[ndim]{a4.nstride, a4.kstride, a4.jstride, 1};
-
-            // Construct DLTensor
-            auto *dl_tensor = new DLManagedTensor;
-            dl_tensor->dl_tensor.data = const_cast<void*>(static_cast<void const*>(a4.dataPtr()));
-            dl_tensor->dl_tensor.device = dlpack::detect_device_from_pointer(a4.dataPtr());
-            dl_tensor->dl_tensor.ndim = ndim;
-            dl_tensor->dl_tensor.dtype = dlpack::get_dlpack_dtype<T>();
-            dl_tensor->dl_tensor.shape = shape;
-            dl_tensor->dl_tensor.strides = strides;
-            dl_tensor->dl_tensor.byte_offset = 0;
-            dl_tensor->manager_ctx = nullptr;
-            dl_tensor->deleter = [](DLManagedTensor *self) {
+
+            // Construct DLManagedTensorVersioned (DLPack 1.1 standard)
+            auto *dl_mgt_tensor = new DLManagedTensorVersioned;
+            dl_mgt_tensor->version = DLPackVersion{};
+            dl_mgt_tensor->flags = 0;  // No special flags
+            dl_mgt_tensor->dl_tensor.data = const_cast<void*>(static_cast<void const*>(a4.dataPtr()));
+            dl_mgt_tensor->dl_tensor.device = dlpack::detect_device_from_pointer(a4.dataPtr());
+            dl_mgt_tensor->dl_tensor.ndim = ndim;
+            dl_mgt_tensor->dl_tensor.dtype = dlpack::get_dlpack_dtype<T>();
+            dl_mgt_tensor->dl_tensor.shape = new int64_t[ndim]{a4.nComp(), len.z, len.y, len.x};
+            dl_mgt_tensor->dl_tensor.strides = new int64_t[ndim]{a4.nstride, a4.kstride, a4.jstride, 1};
+            dl_mgt_tensor->dl_tensor.byte_offset = 0;
+            dl_mgt_tensor->manager_ctx = nullptr;  // TODO: we can increase/decrease the Python ref counter of the producer here
+            dl_mgt_tensor->deleter = [](DLManagedTensorVersioned *self) {
                 delete[] self->dl_tensor.shape;
                 delete[] self->dl_tensor.strides;
                 delete self;
             };
             // Return as Python capsule
-            return py::capsule(dl_tensor, "dltensor", [](void* ptr) {
-                auto* tensor = static_cast<DLManagedTensor*>(ptr);
-                tensor->deleter(tensor);
-            });
+            return py::capsule(
+                dl_mgt_tensor,
+                "dltensor_versioned",
+                /*[](void* ptr) {
+                    auto* tensor = static_cast<DLManagedTensorVersioned*>(ptr);
+                    tensor->deleter(tensor);
+                }*/
+                [](PyObject *capsule)
+                {
+                    auto *p = static_cast<DLManagedTensorVersioned*>(
+                        PyCapsule_GetPointer(capsule, "dltensor_versioned"));
+                    if (p && p->deleter)
+                        p->deleter(p);
+                }
+            );
         },
-        py::arg("stream") = py::none(),
+        //py::arg("stream") = py::none(),
+        // ... other args & their defaults
         R"doc(
         DLPack protocol for zero-copy tensor exchange.
         See https://dmlc.github.io/dlpack/latest/ for details.
@@ -265,11 +286,13 @@ namespace pyAMReX
         )doc"
         )
         .def("__dlpack_device__", [](Array4<T> const &a4) {
             DLDevice device = dlpack::detect_device_from_pointer(a4.dataPtr());
-            return std::make_tuple(device.device_type, device.device_id);
+            return std::make_tuple(static_cast<int>(device.device_type), device.device_id);
         }, R"doc(
         DLPack device info (device_type, device_id).
         )doc")
+
+
         .def("to_host", [](Array4<T> const & a4) {
             // py::tuple to std::vector
             auto const a4i = pyAMReX::array_interface(a4);
diff --git a/src/dlpack.h b/src/dlpack.h
index f962e270..4dd84dee 100644
--- a/src/dlpack.h
+++ b/src/dlpack.h
@@ -40,9 +40,9 @@ extern "C" {
  */
 typedef struct {
   /*! \brief DLPack major version. */
-  uint32_t major;
+  uint32_t major = 1;
   /*! \brief DLPack minor version. */
-  uint32_t minor;
+  uint32_t minor = 1;
 } DLPackVersion;
 
 /*!
@@ -238,37 +238,7 @@ typedef struct {
   uint64_t byte_offset;
 } DLTensor;
 
-/*!
- * \brief C Tensor object, manage memory of DLTensor. This data structure is
- * intended to facilitate the borrowing of DLTensor by another framework. It is
- * not meant to transfer the tensor. When the borrowing framework doesn't need
- * the tensor, it should call the deleter to notify the host that the resource
- * is no longer needed.
- *
- * \note This data structure is used as Legacy DLManagedTensor
- *       in DLPack exchange and is deprecated after DLPack v0.8
- *       Use DLManagedTensorVersioned instead.
- *       This data structure may get renamed or deleted in future versions.
- *
- * \sa DLManagedTensorVersioned
- */
-typedef struct DLManagedTensor {
-  /*! \brief DLTensor which is being memory managed */
-  DLTensor dl_tensor;
-  /*! \brief the context of the original host framework of DLManagedTensor in
-   *   which DLManagedTensor is used in the framework. It can also be NULL.
-   */
-  void * manager_ctx;
-  /*!
-   * \brief Destructor - this should be called
-   * to destruct the manager_ctx which backs the DLManagedTensor. It can be
-   * NULL if there is no way for the caller to provide a reasonable destructor.
-   * The destructor deletes the argument self as well.
-   */
-  void (*deleter)(struct DLManagedTensor * self);
-} DLManagedTensor;
-
-// bit masks used in in the DLManagedTensorVersioned
+// bit masks used in the DLManagedTensorVersioned
 
 /*! \brief bit mask to indicate that the tensor is read only. */
 #define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
@@ -314,34 +284,35 @@ namespace pyAMReX::dlpack
     AMREX_INLINE
     DLDataType get_dlpack_dtype ()
     {
+        using V = std::decay_t<T>;
         DLDataType dtype{};
 
-        if constexpr (std::is_same_v<T, float>) {
+        if constexpr (std::is_same_v<V, float>) {
             dtype.code = kDLFloat;
             dtype.bits = 32;
             dtype.lanes = 1;
         }
-        else if constexpr (std::is_same_v<T, double>) {
+        else if constexpr (std::is_same_v<V, double>) {
             dtype.code = kDLFloat;
             dtype.bits = 64;
             dtype.lanes = 1;
         }
-        else if constexpr (std::is_same_v<T, std::int32_t>) {
+        else if constexpr (std::is_same_v<V, std::int32_t>) {
             dtype.code = kDLInt;
             dtype.bits = 32;
             dtype.lanes = 1;
         }
-        else if constexpr (std::is_same_v<T, std::int64_t>) {
+        else if constexpr (std::is_same_v<V, std::int64_t>) {
             dtype.code = kDLInt;
             dtype.bits = 64;
             dtype.lanes = 1;
         }
-        else if constexpr (std::is_same_v<T, std::uint32_t>) {
+        else if constexpr (std::is_same_v<V, std::uint32_t>) {
             dtype.code = kDLUInt;
             dtype.bits = 32;
             dtype.lanes = 1;
         }
-        else if constexpr (std::is_same_v<T, std::uint64_t>) {
+        else if constexpr (std::is_same_v<V, std::uint64_t>) {
             dtype.code = kDLUInt;
             dtype.bits = 64;
             dtype.lanes = 1;
diff --git a/tests/test_array4.py b/tests/test_array4.py
index 99c37d12..44570837 100644
--- a/tests/test_array4.py
+++ b/tests/test_array4.py
@@ -31,7 +31,9 @@ def test_array4():
     )
     print(f"\nx: {x.__array_interface__} {x.dtype}")
     arr = amr.Array4_double(x)
-    print(f"arr: {arr.__array_interface__}")
+    print(f"arr: DLPack device info: {arr.__dlpack_device__()}")
+    # print(f"arr: DLPack: {arr.__dlpack__()}")
+    print(f"x.shape: {x.shape}")
     print(arr)
 
     assert arr.nComp == 1
@@ -44,16 +46,16 @@ def test_array4():
     assert arr[0, 0, 0] == 1
     assert arr[3, 2, 1] == 1
 
-    # copy to numpy
-    c_arr2np = np.array(arr, copy=True)  # segfaults on Windows
+    # copy to numpy using DLPack
+    c_arr2np = np.from_dlpack(arr)
     assert c_arr2np.ndim == 4
     assert c_arr2np.dtype == np.dtype("double")
     print(f"c_arr2np: {c_arr2np.__array_interface__}")
     np.testing.assert_array_equal(x, c_arr2np[0, :, :, :])
     assert c_arr2np[0, 1, 1, 1] == 42
 
-    # view to numpy
-    v_arr2np = np.array(arr, copy=False)
+    # view to numpy using DLPack
+    v_arr2np = np.from_dlpack(arr)
     assert c_arr2np.ndim == 4
     assert v_arr2np.dtype == np.dtype("double")
     np.testing.assert_array_equal(x, v_arr2np[0, :, :, :])
@@ -65,7 +67,7 @@ def test_array4():
 
     # copy array4 (view)
     c_arr = amr.Array4_double(arr)
-    v_carr2np = np.array(c_arr, copy=False)
+    v_carr2np = np.from_dlpack(c_arr)
     x[1, 1, 1] = 44
     assert v_carr2np[0, 1, 1, 1] == 44
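With DLPack 1.1 the tensor travels as a DLManagedTensorVersioned in a capsule named "dltensor_versioned"; per the spec, a consumer that takes ownership renames the capsule to "used_dltensor_versioned", so the producer's destructor must not free it a second time — that is what the next patch adds. Consumers also need to understand versioned capsules at all; a quick check (assuming NumPy >= 2.1, which supports DLPack 1.x):

    import numpy as np

    major, minor = (int(v) for v in np.__version__.split(".")[:2])
    assert (major, minor) >= (2, 1)
    view = np.from_dlpack(arr)   # consumes the "dltensor_versioned" capsule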
From 181145dc184787e80d3d4092dceda3887e825225 Mon Sep 17 00:00:00 2001
From: Roelof Groenewald
Date: Wed, 23 Jul 2025 17:18:34 -0700
Subject: [PATCH 04/16] early return in `PyCapsule_Destructor` if the capsule
 name is "used_dltensor_versioned"

Signed-off-by: Roelof Groenewald
---
 src/Base/Array4.H | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/Base/Array4.H b/src/Base/Array4.H
index c6285eb4..66f6e488 100644
--- a/src/Base/Array4.H
+++ b/src/Base/Array4.H
@@ -270,6 +270,9 @@ namespace pyAMReX
                 }*/
                 [](PyObject *capsule)
                 {
+                    if (PyCapsule_IsValid(capsule, "used_dltensor_versioned")) {
+                        return;  /* Do nothing if the capsule has been consumed. */
+                    }
                     auto *p = static_cast<DLManagedTensorVersioned*>(
                         PyCapsule_GetPointer(capsule, "dltensor_versioned"));
                     if (p && p->deleter)
@@ -293,7 +296,6 @@ namespace pyAMReX
         )doc")
 
-
         .def("to_host", [](Array4<T> const & a4) {
             // py::tuple to std::vector
             auto const a4i = pyAMReX::array_interface(a4);

From 81929d7aee12a969d704b7588eb13b1804c31b09 Mon Sep 17 00:00:00 2001
From: Roelof Groenewald
Date: Thu, 24 Jul 2025 22:55:05 -0700
Subject: [PATCH 05/16] accept keyword arguments in Array4.__dlpack__ and fix
 Cuda memory type check

Signed-off-by: Roelof Groenewald
---
 src/Base/Array4.H | 5 +++--
 src/dlpack.h      | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/Base/Array4.H b/src/Base/Array4.H
index 66f6e488..bf3979ca 100644
--- a/src/Base/Array4.H
+++ b/src/Base/Array4.H
@@ -230,8 +230,9 @@ namespace pyAMReX
         // https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
         // https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
         .def("__dlpack__", [](
-            Array4<T> const &a4
-            /* TODO:
+            Array4<T> const &a4,
+            py::kwargs kwargs
+            /* TODO: Handle keyword arguments
             [[maybe_unused]] py::handle stream,
             [[maybe_unused]] std::tuple<int, int> max_version,
             [[maybe_unused]] std::tuple<int, int> dl_device,
diff --git a/src/dlpack.h b/src/dlpack.h
index 4dd84dee..d9054876 100644
--- a/src/dlpack.h
+++ b/src/dlpack.h
@@ -365,7 +365,7 @@ namespace pyAMReX::dlpack
         // assume device-side if need be.
         cudaPointerAttributes attr;
         cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
-        if (err == cudaSuccess && attr.memoryType == cudaMemoryTypeDevice) {
+        if (err == cudaSuccess && attr.type == cudaMemoryTypeDevice) {
             device.device_type = kDLCUDA;
             device.device_id = attr.device;
         }
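Accepting py::kwargs means Array-API-style probes such as obj.__dlpack__(max_version=(1, 1)) now bind instead of raising a TypeError, even though the values are not yet interpreted. A sketch of the calls that now succeed:

    cap = arr.__dlpack__(stream=None, max_version=(1, 1))   # accepted, currently ignored
    print(arr.__dlpack_device__())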
From b04171ca649f9a00e6386891feed523ac6ea07bf Mon Sep 17 00:00:00 2001
From: Roelof Groenewald
Date: Fri, 25 Jul 2025 23:05:08 -0700
Subject: [PATCH 06/16] add `array4_to_dpnp` and modify `array4_to_xp` logic
 to distinguish GPU backends

Signed-off-by: Roelof Groenewald
---
 src/amrex/extensions/Array4.py | 62 ++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 6 deletions(-)

diff --git a/src/amrex/extensions/Array4.py b/src/amrex/extensions/Array4.py
index 8cd73f8a..6bf673b6 100644
--- a/src/amrex/extensions/Array4.py
+++ b/src/amrex/extensions/Array4.py
@@ -1,7 +1,7 @@
 """
 This file is part of pyAMReX
 
-Copyright 2023 AMReX community
+Copyright 2023-2025 AMReX community
 
 Authors: Axel Huebl
 License: BSD-3-Clause-LBNL
 """
@@ -92,9 +92,52 @@ def array4_to_cupy(self, copy=False, order="F"):
         raise ValueError("The order argument must be F or C.")
 
 
+def array4_to_dpnp(self, copy=False, order="F"):
+    """
+    Provide a dpnp view into an Array4.
+
+    This includes ngrow guard cells of the box.
+
+    Note on the order of indices:
+    By default, this is as in AMReX in Fortran contiguous order, indexing as
+    x,y,z. This has performance implications for use in external libraries such
+    as dpnp.
+    The order="C" option will index as z,y,x and may perform better.
+    https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074
+
+    Parameters
+    ----------
+    self : amrex.Array4_*
+        An Array4 class in pyAMReX
+    copy : bool, optional
+        Copy the data if true, otherwise create a view (default).
+    order : string, optional
+        F order (default) or C. C is faster with external libraries.
+
+    Returns
+    -------
+    dpnp.array
+        A dpnp n-dimensional array.
+
+    Raises
+    ------
+    ImportError
+        Raises an exception if dpnp is not installed
+    """
+    import dpnp as dp
+
+    if order == "F":
+        return dp.from_dlpack(self, copy=copy).T
+    elif order == "C":
+        return dp.from_dlpack(self, copy=copy)
+    else:
+        raise ValueError("The order argument must be F or C.")
+
+
 def array4_to_xp(self, copy=False, order="F"):
     """
-    Provide a NumPy or CuPy view into an Array4, depending on amr.Config.have_gpu .
+    Provide a NumPy, CuPy or dpnp view into an Array4, depending on amr.Config.have_gpu
+    and amr.Config.gpu_backend .
 
     This function is similar to CuPy's xp naming suggestion for CPU/GPU agnostic code:
     https://docs.cupy.dev/en/stable/user_guide/basic.html#how-to-write-cpu-gpu-agnostic-code
@@ -120,14 +163,20 @@ def array4_to_xp(self, copy=False, order="F"):
     Returns
     -------
     xp.array
-        A NumPy or CuPy n-dimensional array.
+        A NumPy, CuPy or dpnp n-dimensional array.
     """
     import inspect
 
     amr = inspect.getmodule(self)
-    return (
-        self.to_cupy(copy, order) if amr.Config.have_gpu else self.to_numpy(copy, order)
-    )
+
+    if amr.Config.have_gpu:
+        if amr.Config.gpu_backend == 'SYCL':
+            return self.to_dpnp(copy, order)
+        else:  # if not SYCL use cupy
+            return self.to_cupy(copy, order)
+
+    # if no GPU, use NumPy
+    return self.to_numpy(copy, order)
 
 
 def register_Array4_extension(amr):
@@ -144,4 +193,5 @@ def register_Array4_extension(amr):
     ):
         Array4_type.to_numpy = array4_to_numpy
         Array4_type.to_cupy = array4_to_cupy
+        Array4_type.to_dpnp = array4_to_dpnp
         Array4_type.to_xp = array4_to_xp
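With to_dpnp registered next to to_numpy and to_cupy, backend-agnostic user code stays a one-liner through to_xp. A sketch of typical use (3D build; amr.Config.gpu_backend reflects what was compiled in):

    host = arr.to_numpy(copy=False)   # always a host-side view
    data = arr.to_xp()                # NumPy, CuPy, or dpnp to match the build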
From ccce85c8edac2a8346b4590db0a2c663aba2fe6c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 26 Jul 2025 06:06:41 +0000
Subject: [PATCH 07/16] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/amrex/extensions/Array4.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amrex/extensions/Array4.py b/src/amrex/extensions/Array4.py
index 6bf673b6..64b06fab 100644
--- a/src/amrex/extensions/Array4.py
+++ b/src/amrex/extensions/Array4.py
@@ -170,7 +170,7 @@ def array4_to_xp(self, copy=False, order="F"):
     amr = inspect.getmodule(self)
 
     if amr.Config.have_gpu:
-        if amr.Config.gpu_backend == 'SYCL':
+        if amr.Config.gpu_backend == "SYCL":
             return self.to_dpnp(copy, order)
         else:  # if not SYCL use cupy
             return self.to_cupy(copy, order)

From 25d3b348730bd517eba05354775c2a7557ebad33 Mon Sep 17 00:00:00 2001
From: Roelof Groenewald
Date: Sat, 26 Jul 2025 00:14:03 -0700
Subject: [PATCH 08/16] update `MultiFab.py` to also support SYCL backend

Signed-off-by: Roelof Groenewald
---
 src/amrex/extensions/MultiFab.py | 80 ++++++++++++++++++++++++++------
 src/dlpack.h                     |  2 +-
 2 files changed, 68 insertions(+), 14 deletions(-)

diff --git a/src/amrex/extensions/MultiFab.py b/src/amrex/extensions/MultiFab.py
index 61b5d159..0fe7c214 100644
--- a/src/amrex/extensions/MultiFab.py
+++ b/src/amrex/extensions/MultiFab.py
@@ -103,10 +103,50 @@ def mf_to_cupy(self, copy=False, order="F"):
     return views
 
 
+def mf_to_dpnp(self, copy=False, order="F"):
+    """
+    Provide a dpnp view into a MultiFab.
+
+    This includes ngrow guard cells of each box.
+
+    Note on the order of indices:
+    By default, this is as in AMReX in Fortran contiguous order, indexing as
+    x,y,z. This has performance implications for use in external libraries such
+    as dpnp.
+    The order="C" option will index as z,y,x and may perform better.
+    https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074
+
+    Parameters
+    ----------
+    self : amrex.MultiFab
+        A MultiFab class in pyAMReX
+    copy : bool, optional
+        Copy the data if true, otherwise create a view (default).
+    order : string, optional
+        F order (default) or C. C is faster with external libraries.
+
+    Returns
+    -------
+    list of dpnp.array
+        A list of dpnp n-dimensional arrays, for each local block in the
+        MultiFab.
+
+    Raises
+    ------
+    ImportError
+        Raises an exception if dpnp is not installed
+    """
+    views = []
+    for mfi in self:
+        views.append(self.array(mfi).to_dpnp(copy, order))
+
+    return views
+
+
 def mf_to_xp(self, copy=False, order="F"):
     """
-    Provide a NumPy or CuPy view into a MultiFab,
-    depending on amr.Config.have_gpu .
+    Provide a NumPy, CuPy or dpnp view into a MultiFab,
+    depending on amr.Config.have_gpu and amr.Config.gpu_backend .
 
     This function is similar to CuPy's xp naming suggestion for CPU/GPU agnostic code:
     https://docs.cupy.dev/en/stable/user_guide/basic.html#how-to-write-cpu-gpu-agnostic-code
@@ -132,15 +172,21 @@ def mf_to_xp(self, copy=False, order="F"):
     Returns
     -------
     list of xp.array
-        A list of NumPy or CuPy n-dimensional arrays, for each local block in the
-        MultiFab.
+        A list of NumPy, CuPy or dpnp n-dimensional arrays, for each local block
+        in the MultiFab.
     """
     import inspect
 
     amr = inspect.getmodule(self)
-    return (
-        self.to_cupy(copy, order) if amr.Config.have_gpu else self.to_numpy(copy, order)
-    )
+
+    if amr.Config.have_gpu:
+        if amr.Config.gpu_backend == "SYCL":
+            return self.to_dpnp(copy, order)
+        else:  # if not SYCL use cupy
+            return self.to_cupy(copy, order)
+
+    # if no GPU, use NumPy
+    return self.to_numpy(copy, order)
 
 
 def copy_multifab(amr, self):
@@ -490,6 +536,9 @@ def __getitem__(self, index, with_internal_ghosts=False):
         Whether to include internal ghost cells. When true, data from ghost cells may be used that
         overlaps valid cells.
     """
+    import inspect
+    amr = inspect.getmodule(self)
+
     index4 = _process_index(self, index)
 
     # Gather the data to be included in a list to be sent to other processes
@@ -503,17 +552,18 @@ def __getitem__(self, index, with_internal_ghosts=False):
             device_arr = _get_field(self, mfi)
             slice_arr = device_arr[block_slices]
             try:
-                # Copy data from host to device using cupy syntax
-                slice_arr = slice_arr.get()
+                if amr.Config.gpu_backend == "SYCL":
+                    import dpnp
+                    slice_arr = dpnp.asnumpy(slice_arr)
+                else:
+                    # Copy data from device to host using cupy syntax
+                    slice_arr = slice_arr.get()
             except AttributeError:
                 # Array is already a numpy array on the host
                 pass
             datalist.append((global_slices, slice_arr))
 
     # Gather the data from all processors
-    import inspect
-
-    amr = inspect.getmodule(self)
     if amr.Config.have_mpi:
         npes = amr.ParallelDescriptor.NProcs()
     else:
@@ -604,7 +654,10 @@ def __setitem__(self, index, value):
 
     amr = inspect.getmodule(self)
     if amr.Config.have_gpu:
-        import cupy as xp
+        if amr.Config.gpu_backend == "SYCL":
+            import dpnp as xp
+        else:
+            import cupy as xp
     else:
         xp = np
 
@@ -654,6 +707,7 @@ def register_MultiFab_extension(amr):
 
     amr.MultiFab.to_numpy = mf_to_numpy
     amr.MultiFab.to_cupy = mf_to_cupy
+    amr.MultiFab.to_dpnp = mf_to_dpnp
     amr.MultiFab.to_xp = mf_to_xp
 
     amr.MultiFab.copy = lambda self: copy_multifab(amr, self)
diff --git a/src/dlpack.h b/src/dlpack.h
index d9054876..e25fb19f 100644
--- a/src/dlpack.h
+++ b/src/dlpack.h
@@ -197,7 +197,7 @@ typedef struct {
  * types. This pointer is always aligned to 256 bytes as in CUDA. The
  * `byte_offset` field should be used to point to the beginning of the data.
  *
- * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
+ * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
  * TVM, perhaps others) do not adhere to this 256 byte aligment requirement
  * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
  * (after which this note will be updated); at the moment it is recommended
  * to not rely on the data pointer being correctly aligned.

From 228f940e557483df30e8c6b88bb332503d4d0fb8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 26 Jul 2025 07:14:19 +0000
Subject: [PATCH 09/16] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/amrex/extensions/MultiFab.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/amrex/extensions/MultiFab.py b/src/amrex/extensions/MultiFab.py
index 0fe7c214..776cc5d3 100644
--- a/src/amrex/extensions/MultiFab.py
+++ b/src/amrex/extensions/MultiFab.py
@@ -537,6 +537,7 @@ def __getitem__(self, index, with_internal_ghosts=False):
         overlaps valid cells.
     """
     import inspect
+
     amr = inspect.getmodule(self)
 
     index4 = _process_index(self, index)
@@ -554,6 +555,7 @@ def __getitem__(self, index, with_internal_ghosts=False):
             try:
                 if amr.Config.gpu_backend == "SYCL":
                     import dpnp
+
                     slice_arr = dpnp.asnumpy(slice_arr)
                 else:
                     # Copy data from device to host using cupy syntax

From 13f68c9d8cb41b4ea379dbd5ee6118edd244c343 Mon Sep 17 00:00:00 2001
From: Roelof Groenewald
Date: Sat, 26 Jul 2025 09:33:00 -0700
Subject: [PATCH 10/16] use `array4_to_xp` in `MultiFab.py` to avoid needing
 `mf_to_cupy` and `mf_to_dpnp`

Signed-off-by: Roelof Groenewald
---
 src/amrex/extensions/MultiFab.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/amrex/extensions/MultiFab.py b/src/amrex/extensions/MultiFab.py
index 776cc5d3..9d0d1d90 100644
--- a/src/amrex/extensions/MultiFab.py
+++ b/src/amrex/extensions/MultiFab.py
@@ -175,18 +175,11 @@ def mf_to_xp(self, copy=False, order="F"):
         A list of NumPy, CuPy or dpnp n-dimensional arrays, for each local block
         in the MultiFab.
""" - import inspect - - amr = inspect.getmodule(self) - - if amr.Config.have_gpu: - if amr.Config.gpu_backend == "SYCL": - return self.to_dpnp(copy, order) - else: # if not SYCL use cupy - return self.to_cupy(copy, order) + views = [] + for mfi in self: + views.append(self.array(mfi).to_xp(copy, order)) - # if no GPU, use NumPy - return self.to_numpy(copy, order) + return views def copy_multifab(amr, self): From 13f68c9d8cb41b4ea379dbd5ee6118edd244c343 Mon Sep 17 00:00:00 2001 From: Roelof Groenewald Date: Sat, 26 Jul 2025 11:27:34 -0700 Subject: [PATCH 11/16] add specific optional arguments to `Array4.__dlpack__` Signed-off-by: Roelof Groenewald --- src/Base/Array4.H | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/Base/Array4.H b/src/Base/Array4.H index bf3979ca..91bc7f07 100644 --- a/src/Base/Array4.H +++ b/src/Base/Array4.H @@ -231,13 +231,12 @@ namespace pyAMReX // https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol .def("__dlpack__", []( Array4 const &a4, - py::kwargs kwargs - /* TODO: Handle keyword arguments - [[maybe_unused]] py::handle stream, - [[maybe_unused]] std::tuple max_version, - [[maybe_unused]] std::tuple dl_device, - [[maybe_unused]] bool copy - */ + /* TODO: Handle keyword arguments */ + [[maybe_unused]] std::optional stream = std::nullopt, + [[maybe_unused]] std::optional> max_version = std::nullopt, + [[maybe_unused]] std::optional> dl_device = std::nullopt, + [[maybe_unused]] std::optional copy = std::nullopt + ) { // Allocate shape/strides arrays @@ -281,8 +280,10 @@ namespace pyAMReX } ); }, - //py::arg("stream") = py::none(), - // ... other args & their defaults + py::arg("stream") = py::none(), + py::arg("max_version") = py::none(), + py::arg("dl_device") = py::none(), + py::arg("copy") = py::none(), R"doc( DLPack protocol for zero-copy tensor exchange. See https://dmlc.github.io/dlpack/latest/ for details. From 728fa4a986ba938c307b25ed28f8e394c2ac23bb Mon Sep 17 00:00:00 2001 From: Roelof Groenewald Date: Sun, 27 Jul 2025 17:46:19 -0700 Subject: [PATCH 12/16] remove `mf_to_cupy` and `mf_to_dpnp` Signed-off-by: Roelof Groenewald --- src/amrex/extensions/MultiFab.py | 83 -------------------------------- 1 file changed, 83 deletions(-) diff --git a/src/amrex/extensions/MultiFab.py b/src/amrex/extensions/MultiFab.py index 9d0d1d90..dea96244 100644 --- a/src/amrex/extensions/MultiFab.py +++ b/src/amrex/extensions/MultiFab.py @@ -63,86 +63,6 @@ def mf_to_numpy(self, copy=False, order="F"): return views -def mf_to_cupy(self, copy=False, order="F"): - """ - Provide a CuPy view into a MultiFab. - - This includes ngrow guard cells of each box. - - Note on the order of indices: - By default, this is as in AMReX in Fortran contiguous order, indexing as - x,y,z. This has performance implications for use in external libraries such - as cupy. - The order="C" option will index as z,y,x and perform better with cupy. - https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074 - - Parameters - ---------- - self : amrex.MultiFab - A MultiFab class in pyAMReX - copy : bool, optional - Copy the data if true, otherwise create a view (default). - order : string, optional - F order (default) or C. C is faster with external libraries. - - Returns - ------- - list of cupy.array - A list of CuPy n-dimensional arrays, for each local block in the - MultiFab. 
From 728fa4a986ba938c307b25ed28f8e394c2ac23bb Mon Sep 17 00:00:00 2001
From: Roelof Groenewald
Date: Sun, 27 Jul 2025 17:46:19 -0700
Subject: [PATCH 12/16] remove `mf_to_cupy` and `mf_to_dpnp`

Signed-off-by: Roelof Groenewald
---
 src/amrex/extensions/MultiFab.py | 83 --------------------------------
 1 file changed, 83 deletions(-)

diff --git a/src/amrex/extensions/MultiFab.py b/src/amrex/extensions/MultiFab.py
index 9d0d1d90..dea96244 100644
--- a/src/amrex/extensions/MultiFab.py
+++ b/src/amrex/extensions/MultiFab.py
@@ -63,86 +63,6 @@ def mf_to_numpy(self, copy=False, order="F"):
     return views
 
 
-def mf_to_cupy(self, copy=False, order="F"):
-    """
-    Provide a CuPy view into a MultiFab.
-
-    This includes ngrow guard cells of each box.
-
-    Note on the order of indices:
-    By default, this is as in AMReX in Fortran contiguous order, indexing as
-    x,y,z. This has performance implications for use in external libraries such
-    as cupy.
-    The order="C" option will index as z,y,x and perform better with cupy.
-    https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074
-
-    Parameters
-    ----------
-    self : amrex.MultiFab
-        A MultiFab class in pyAMReX
-    copy : bool, optional
-        Copy the data if true, otherwise create a view (default).
-    order : string, optional
-        F order (default) or C. C is faster with external libraries.
-
-    Returns
-    -------
-    list of cupy.array
-        A list of CuPy n-dimensional arrays, for each local block in the
-        MultiFab.
-
-    Raises
-    ------
-    ImportError
-        Raises an exception if cupy is not installed
-    """
-    views = []
-    for mfi in self:
-        views.append(self.array(mfi).to_cupy(copy, order))
-
-    return views
-
-
-def mf_to_dpnp(self, copy=False, order="F"):
-    """
-    Provide a dpnp view into a MultiFab.
-
-    This includes ngrow guard cells of each box.
-
-    Note on the order of indices:
-    By default, this is as in AMReX in Fortran contiguous order, indexing as
-    x,y,z. This has performance implications for use in external libraries such
-    as dpnp.
-    The order="C" option will index as z,y,x and may perform better.
-    https://github.com/AMReX-Codes/pyamrex/issues/55#issuecomment-1579610074
-
-    Parameters
-    ----------
-    self : amrex.MultiFab
-        A MultiFab class in pyAMReX
-    copy : bool, optional
-        Copy the data if true, otherwise create a view (default).
-    order : string, optional
-        F order (default) or C. C is faster with external libraries.
-
-    Returns
-    -------
-    list of dpnp.array
-        A list of dpnp n-dimensional arrays, for each local block in the
-        MultiFab.
-
-    Raises
-    ------
-    ImportError
-        Raises an exception if dpnp is not installed
-    """
-    views = []
-    for mfi in self:
-        views.append(self.array(mfi).to_dpnp(copy, order))
-
-    return views
-
-
 def mf_to_xp(self, copy=False, order="F"):
     """
     Provide a NumPy, CuPy or dpnp view into a MultiFab,
     depending on amr.Config.have_gpu and amr.Config.gpu_backend .
@@ -621,8 +541,6 @@ def register_MultiFab_extension(amr):
     amr.MultiFab.__iter__ = lambda mfab: amr.MFIter(mfab)
 
     amr.MultiFab.to_numpy = mf_to_numpy
-    amr.MultiFab.to_cupy = mf_to_cupy
-    amr.MultiFab.to_dpnp = mf_to_dpnp
     amr.MultiFab.to_xp = mf_to_xp
 
     amr.MultiFab.copy = lambda self: copy_multifab(amr, self)
@@ -636,7 +554,6 @@ def register_MultiFab_extension(amr):
     amr.iMultiFab.__iter__ = lambda imfab: amr.MFIter(imfab)
 
     amr.iMultiFab.to_numpy = mf_to_numpy
-    amr.iMultiFab.to_cupy = mf_to_cupy
     amr.iMultiFab.to_xp = mf_to_xp
 
     amr.iMultiFab.copy = lambda self: copy_multifab(amr, self)
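After this cleanup the per-box backend dispatch lives in exactly one place, Array4.to_xp, and MultiFab.to_xp merely collects those views. Typical use is unchanged (a sketch, mf being any MultiFab):

    for block in mf.to_xp():     # one n-dimensional array per local box
        block *= 2.0             # in-place update through the view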
"dlpack.h" + + +void init_DLPack(py::module& m) +{ + using namespace amrex; + + // register types only if not already present, e.g., from another library + // that also implements DLPack bindings and exposes the types + + py::type pyDLDeviceType = py::type::of(); + if (!pyDLDeviceType) { + py::native_enum(m, "DLDeviceType", "enum.IntEnum") + .value("kDLCPU", DLDeviceType::kDLCPU) + .value("kDLCUDA", DLDeviceType::kDLCUDA) + .value("kDLCUDAHost", DLDeviceType::kDLCUDAHost) + .value("kDLOpenCL", DLDeviceType::kDLOpenCL) + .value("kDLVulkan", DLDeviceType::kDLVulkan) + .value("kDLMetal", DLDeviceType::kDLMetal) + .value("kDLVPI", DLDeviceType::kDLVPI) + .value("kDLROCM", DLDeviceType::kDLROCM) + .value("kDLROCMHost", DLDeviceType::kDLROCMHost) + .value("kDLExtDev", DLDeviceType::kDLExtDev) + .value("kDLCUDAManaged", DLDeviceType::kDLCUDAManaged) + .value("kDLOneAPI", DLDeviceType::kDLOneAPI) + .value("kDLWebGPU", DLDeviceType::kDLWebGPU) + .value("kDLHexagon", DLDeviceType::kDLHexagon) + .value("kDLMAIA", DLDeviceType::kDLMAIA) + ; + } + +} diff --git a/src/dlpack.h b/src/dlpack/dlpack.h similarity index 100% rename from src/dlpack.h rename to src/dlpack/dlpack.h diff --git a/src/pyAMReX.cpp b/src/pyAMReX.cpp index 36ce03d0..ab9ccfa9 100644 --- a/src/pyAMReX.cpp +++ b/src/pyAMReX.cpp @@ -20,6 +20,7 @@ void init_Arena(py::module&); void init_Array4(py::module&); void init_BaseFab(py::module&); void init_Box(py::module &); +void init_DLPack(py::module &); void init_RealBox(py::module &); void init_BoxArray(py::module &); void init_CoordSys(py::module&); @@ -98,6 +99,7 @@ PYBIND11_MODULE(amrex_3d_pybind, m) { // note: order from parent to child classes and argument usage + init_DLPack(m); init_AMReX(m); init_Arena(m); init_Dim3(m); From 63a007168b0d9c5b435c68402e82a6b6396454e4 Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Mon, 28 Jul 2025 14:47:39 -0700 Subject: [PATCH 14/16] Simplify Signed-off-by: Axel Huebl --- src/dlpack/DLPack.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dlpack/DLPack.cpp b/src/dlpack/DLPack.cpp index eaad9f3c..493ec1e0 100644 --- a/src/dlpack/DLPack.cpp +++ b/src/dlpack/DLPack.cpp @@ -10,7 +10,7 @@ void init_DLPack(py::module& m) // register types only if not already present, e.g., from another library // that also implements DLPack bindings and exposes the types - py::type pyDLDeviceType = py::type::of(); + py::type pyDLDeviceType = false; // TODO: py::type::of(); if (!pyDLDeviceType) { py::native_enum(m, "DLDeviceType", "enum.IntEnum") .value("kDLCPU", DLDeviceType::kDLCPU) From ac9e5bc0c5a024522f0aa20d247be272a909436a Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Mon, 28 Jul 2025 14:48:33 -0700 Subject: [PATCH 15/16] Simplify Signed-off-by: Axel Huebl --- src/dlpack/DLPack.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/dlpack/DLPack.cpp b/src/dlpack/DLPack.cpp index 493ec1e0..29d806a9 100644 --- a/src/dlpack/DLPack.cpp +++ b/src/dlpack/DLPack.cpp @@ -10,7 +10,8 @@ void init_DLPack(py::module& m) // register types only if not already present, e.g., from another library // that also implements DLPack bindings and exposes the types - py::type pyDLDeviceType = false; // TODO: py::type::of(); + // TODO: py::type pyDLDeviceType = py::type::of(); + bool pyDLDeviceType = false; if (!pyDLDeviceType) { py::native_enum(m, "DLDeviceType", "enum.IntEnum") .value("kDLCPU", DLDeviceType::kDLCPU) From b6e03919d5fe29d593d0a977a4f0f977702a4548 Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Fri, 7 Nov 2025 14:48:11 -0800 
From b6e03919d5fe29d593d0a977a4f0f977702a4548 Mon Sep 17 00:00:00 2001
From: Axel Huebl
Date: Fri, 7 Nov 2025 14:48:11 -0800
Subject: [PATCH 16/16] Fix enum: export/finalize

Signed-off-by: Axel Huebl
---
 src/dlpack/DLPack.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/dlpack/DLPack.cpp b/src/dlpack/DLPack.cpp
index 29d806a9..72a1a8e2 100644
--- a/src/dlpack/DLPack.cpp
+++ b/src/dlpack/DLPack.cpp
@@ -29,6 +29,8 @@ void init_DLPack(py::module& m)
             .value("kDLWebGPU", DLDeviceType::kDLWebGPU)
             .value("kDLHexagon", DLDeviceType::kDLHexagon)
             .value("kDLMAIA", DLDeviceType::kDLMAIA)
+            .export_values()
+            .finalize()
         ;
     }
 